From ba2b9e54ef12126ab982c0ac4ad46ccc020777ae Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Tue, 17 Dec 2024 21:59:57 +0000 Subject: [PATCH] GH-44308: [C++][FS][Azure] Implement SAS token authentication (#45021) ### Rationale for this change SAS token auth is sometimes useful and it the last one we haven't implemented. ### What changes are included in this PR? - Implement `ConfigureSasCredential` - Update `AzureOptions::FromUri` so that simply appending a SAS token to a blob storage URI works. e.g. `AzureOptions::FromUri("abfs://file_system@ account.dfs.core.windows.net/?se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%2B1SqLxPK%2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04")` - SAS tokens are made up of a bunch of URI query parameters that I'm not sure we can exhaustively list. - Therefore we now assume that any unrecognised URI query parameters are assumed to be part of a SAS token, instead of returning an error status. - Update `CopyFile` to use StartCopyFromUri instead of CopyFromUri - This avoids the need to generate SAS tokens. - Supports blobs bigger than 256MiB - This makes https://github.com/apache/arrow/issues/41315 redundant ### Are these changes tested? Yes - Added new tests for authenticating with SAS and doing some operations including `CopyFile` - Added new tests for `AzureOptions::FromUri` with a SAS token. I also made sure to run the tests which connect to real blob storage. ### Are there any user-facing changes? - SAS token in now supported - Unrecognised URI query parameters are ignored by `AzureOptions::FromUri` instead of failing fast. IMO this is a regression but still the best option to support SAS token. * GitHub Issue: #44308 Authored-by: Thomas Newton Signed-off-by: Sutou Kouhei --- cpp/src/arrow/filesystem/azurefs.cc | 89 ++++++++++++++---------- cpp/src/arrow/filesystem/azurefs.h | 14 ++-- cpp/src/arrow/filesystem/azurefs_test.cc | 88 +++++++++++++++++++++++ 3 files changed, 148 insertions(+), 43 deletions(-) diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 78f4ad1edd9a9..4638bb12c783c 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -106,6 +106,18 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { std::string tenant_id; std::string client_id; std::string client_secret; + + // These query parameters are the union of the following docs: + // https://learn.microsoft.com/en-us/rest/api/storageservices/create-account-sas#specify-the-account-sas-parameters + // https://learn.microsoft.com/en-us/rest/api/storageservices/create-service-sas#construct-a-service-sas + // (excluding parameters for table storage only) + // https://learn.microsoft.com/en-us/rest/api/storageservices/create-user-delegation-sas#construct-a-user-delegation-sas + static const std::set sas_token_query_parameters = { + "sv", "ss", "sr", "st", "se", "sp", "si", "sip", "spr", + "skoid", "sktid", "srt", "skt", "ske", "skv", "sks", "saoid", "suoid", + "scid", "sdd", "ses", "sig", "rscc", "rscd", "rsce", "rscl", "rsct", + }; + ARROW_ASSIGN_OR_RAISE(const auto options_items, uri.query_items()); for (const auto& kv : options_items) { if (kv.first == "blob_storage_authority") { @@ -147,6 +159,9 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { } else if (kv.first == "background_writes") { ARROW_ASSIGN_OR_RAISE(background_writes, ::arrow::internal::ParseBoolean(kv.second)); + } else if (sas_token_query_parameters.find(kv.first) != + sas_token_query_parameters.end()) { + credential_kind = CredentialKind::kSASToken; } else { return Status::Invalid( "Unexpected query parameter in Azure Blob File System URI: '", kv.first, "'"); @@ -180,6 +195,13 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) { case CredentialKind::kEnvironment: RETURN_NOT_OK(ConfigureEnvironmentCredential()); break; + case CredentialKind::kSASToken: + // Reconstructing the SAS token without the other URI query parameters is awkward + // because some parts are URI escaped and some parts are not. Instead we just + // pass through the entire query string and Azure ignores the extra query + // parameters. + RETURN_NOT_OK(ConfigureSASCredential("?" + uri.query_string())); + break; default: // Default credential break; @@ -225,7 +247,6 @@ Result AzureOptions::FromUri(const std::string& uri_string, } bool AzureOptions::Equals(const AzureOptions& other) const { - // TODO(GH-38598): update here when more auth methods are added. const bool equals = blob_storage_authority == other.blob_storage_authority && dfs_storage_authority == other.dfs_storage_authority && blob_storage_scheme == other.blob_storage_scheme && @@ -243,6 +264,8 @@ bool AzureOptions::Equals(const AzureOptions& other) const { case CredentialKind::kStorageSharedKey: return storage_shared_key_credential_->AccountName == other.storage_shared_key_credential_->AccountName; + case CredentialKind::kSASToken: + return sas_token_ == other.sas_token_; case CredentialKind::kClientSecret: case CredentialKind::kCLI: case CredentialKind::kManagedIdentity: @@ -311,6 +334,15 @@ Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_ke return Status::OK(); } +Status AzureOptions::ConfigureSASCredential(const std::string& sas_token) { + credential_kind_ = CredentialKind::kSASToken; + if (account_name.empty()) { + return Status::Invalid("AzureOptions doesn't contain a valid account name"); + } + sas_token_ = sas_token; + return Status::OK(); +} + Status AzureOptions::ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret) { @@ -372,6 +404,9 @@ Result> AzureOptions::MakeBlobServiceC case CredentialKind::kStorageSharedKey: return std::make_unique(AccountBlobUrl(account_name), storage_shared_key_credential_); + case CredentialKind::kSASToken: + return std::make_unique(AccountBlobUrl(account_name) + + sas_token_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } @@ -404,29 +439,13 @@ AzureOptions::MakeDataLakeServiceClient() const { case CredentialKind::kStorageSharedKey: return std::make_unique( AccountDfsUrl(account_name), storage_shared_key_credential_); + case CredentialKind::kSASToken: + return std::make_unique( + AccountBlobUrl(account_name) + sas_token_); } return Status::Invalid("AzureOptions doesn't contain a valid auth configuration"); } -Result AzureOptions::GenerateSASToken( - Storage::Sas::BlobSasBuilder* builder, Blobs::BlobServiceClient* client) const { - using SasProtocol = Storage::Sas::SasProtocol; - builder->Protocol = - blob_storage_scheme == "http" ? SasProtocol::HttpsAndHttp : SasProtocol::HttpsOnly; - if (storage_shared_key_credential_) { - return builder->GenerateSasToken(*storage_shared_key_credential_); - } else { - // GH-39344: This part isn't tested. This may not work. - try { - auto delegation_key_response = client->GetUserDelegationKey(builder->ExpiresOn); - return builder->GenerateSasToken(delegation_key_response.Value, account_name); - } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, "GetUserDelegationKey failed for '", - client->GetUrl(), "'."); - } - } -} - namespace { // An AzureFileSystem represents an Azure storage account. An AzureLocation describes a @@ -3161,19 +3180,7 @@ class AzureFileSystem::Impl { if (src == dest) { return Status::OK(); } - std::string sas_token; - { - Storage::Sas::BlobSasBuilder builder; - std::chrono::seconds available_period(60); - builder.ExpiresOn = std::chrono::system_clock::now() + available_period; - builder.BlobContainerName = src.container; - builder.BlobName = src.path; - builder.Resource = Storage::Sas::BlobSasResource::Blob; - builder.SetPermissions(Storage::Sas::BlobSasPermissions::Read); - ARROW_ASSIGN_OR_RAISE( - sas_token, options_.GenerateSASToken(&builder, blob_service_client_.get())); - } - auto src_url = GetBlobClient(src.container, src.path).GetUrl() + sas_token; + auto src_url = GetBlobClient(src.container, src.path).GetUrl(); auto dest_blob_client = GetBlobClient(dest.container, dest.path); if (!dest.path.empty()) { auto dest_parent = dest.parent(); @@ -3186,9 +3193,21 @@ class AzureFileSystem::Impl { } } try { - dest_blob_client.CopyFromUri(src_url); + // We use StartCopyFromUri instead of CopyFromUri because it supports blobs larger + // than 256 MiB and it doesn't require generating a SAS token to authenticate + // reading a source blob in the same storage account. + auto copy_operation = dest_blob_client.StartCopyFromUri(src_url); + // For large blobs, the copy operation may be slow so we need to poll until it + // completes. We use a polling interval of 1 second. + copy_operation.PollUntilDone(std::chrono::milliseconds(1000)); } catch (const Storage::StorageException& exception) { - return ExceptionToStatus(exception, "Failed to copy a blob. (", src_url, " -> ", + // StartCopyFromUri failed or a GetProperties call inside PollUntilDone failed. + return ExceptionToStatus( + exception, "Failed to start blob copy or poll status of ongoing copy. (", + src_url, " -> ", dest_blob_client.GetUrl(), ")"); + } catch (const Azure::Core::RequestFailedException& exception) { + // A GetProperties call inside PollUntilDone returned a failed CopyStatus. + return ExceptionToStatus(exception, "Failed to copy blob. (", src_url, " -> ", dest_blob_client.GetUrl(), ")"); } return Status::OK(); diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index c5e5091256959..ee0956afdd7a9 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -37,10 +37,6 @@ namespace Azure::Storage::Blobs { class BlobServiceClient; } -namespace Azure::Storage::Sas { -struct BlobSasBuilder; -} - namespace Azure::Storage::Files::DataLake { class DataLakeFileSystemClient; class DataLakeServiceClient; @@ -120,6 +116,7 @@ struct ARROW_EXPORT AzureOptions { kDefault, kAnonymous, kStorageSharedKey, + kSASToken, kClientSecret, kManagedIdentity, kCLI, @@ -129,6 +126,7 @@ struct ARROW_EXPORT AzureOptions { std::shared_ptr storage_shared_key_credential_; + std::string sas_token_; mutable std::shared_ptr token_credential_; public: @@ -180,6 +178,9 @@ struct ARROW_EXPORT AzureOptions { /// AzureOptions::ConfigureClientSecretCredential() is called. /// * client_secret: You must specify "tenant_id" and "client_id" /// too. AzureOptions::ConfigureClientSecretCredential() is called. + /// * A SAS token is made up of several query parameters. Appending a SAS + /// token to the URI configures SAS token auth by calling + /// AzureOptions::ConfigureSASCredential(). /// /// [1]: /// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri @@ -189,6 +190,7 @@ struct ARROW_EXPORT AzureOptions { Status ConfigureDefaultCredential(); Status ConfigureAnonymousCredential(); Status ConfigureAccountKeyCredential(const std::string& account_key); + Status ConfigureSASCredential(const std::string& sas_token); Status ConfigureClientSecretCredential(const std::string& tenant_id, const std::string& client_id, const std::string& client_secret); @@ -207,10 +209,6 @@ struct ARROW_EXPORT AzureOptions { Result> MakeDataLakeServiceClient() const; - - Result GenerateSASToken( - Azure::Storage::Sas::BlobSasBuilder* builder, - Azure::Storage::Blobs::BlobServiceClient* client) const; }; /// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 9842b39227e5f..7c1d450051901 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -714,6 +714,36 @@ class TestAzureOptions : public ::testing::Test { ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kEnvironment); } + void TestFromUriCredentialSASToken() { + const std::string sas_token = + "?se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%2B1SqLxPK%" + "2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04"; + ASSERT_OK_AND_ASSIGN( + auto options, + AzureOptions::FromUri( + "abfs://file_system@account.dfs.core.windows.net/" + sas_token, nullptr)); + ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSASToken); + ASSERT_EQ(options.sas_token_, sas_token); + } + + void TestFromUriCredentialSASTokenWithOtherParameters() { + const std::string uri_query_string = + "?enable_tls=false&se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%" + "2B1SqLxPK%" + "2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04"; + ASSERT_OK_AND_ASSIGN( + auto options, + AzureOptions::FromUri( + "abfs://account@127.0.0.1:10000/container/dir/blob" + uri_query_string, + nullptr)); + ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSASToken); + ASSERT_EQ(options.sas_token_, uri_query_string); + ASSERT_EQ(options.blob_storage_authority, "127.0.0.1:10000"); + ASSERT_EQ(options.dfs_storage_authority, "127.0.0.1:10000"); + ASSERT_EQ(options.blob_storage_scheme, "http"); + ASSERT_EQ(options.dfs_storage_scheme, "http"); + } + void TestFromUriCredentialInvalid() { ASSERT_RAISES(Invalid, AzureOptions::FromUri( "abfs://file_system@account.dfs.core.windows.net/dir/file?" @@ -801,6 +831,10 @@ TEST_F(TestAzureOptions, FromUriCredentialWorkloadIdentity) { TEST_F(TestAzureOptions, FromUriCredentialEnvironment) { TestFromUriCredentialEnvironment(); } +TEST_F(TestAzureOptions, FromUriCredentialSASToken) { TestFromUriCredentialSASToken(); } +TEST_F(TestAzureOptions, FromUriCredentialSASTokenWithOtherParameters) { + TestFromUriCredentialSASTokenWithOtherParameters(); +} TEST_F(TestAzureOptions, FromUriCredentialInvalid) { TestFromUriCredentialInvalid(); } TEST_F(TestAzureOptions, FromUriBlobStorageAuthority) { TestFromUriBlobStorageAuthority(); @@ -936,6 +970,20 @@ class TestAzureFileSystem : public ::testing::Test { .Value; } + Result GetContainerSASToken( + const std::string& container_name, + Azure::Storage::StorageSharedKeyCredential storage_shared_key_credential) { + std::string sas_token; + Azure::Storage::Sas::BlobSasBuilder builder; + std::chrono::seconds available_period(60); + builder.ExpiresOn = std::chrono::system_clock::now() + available_period; + builder.BlobContainerName = container_name; + builder.Resource = Azure::Storage::Sas::BlobSasResource::BlobContainer; + builder.SetPermissions(Azure::Storage::Sas::BlobContainerSasPermissions::All); + builder.Protocol = Azure::Storage::Sas::SasProtocol::HttpsAndHttp; + return builder.GenerateSasToken(storage_shared_key_credential); + } + void UploadLines(const std::vector& lines, const std::string& path, int total_size) { ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {})); @@ -1619,6 +1667,31 @@ class TestAzureFileSystem : public ::testing::Test { AssertObjectContents(fs.get(), path, payload); } + void TestSASCredential() { + auto data = SetUpPreexistingData(); + + ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv()); + ASSERT_OK_AND_ASSIGN(auto options, MakeOptions(env)); + ASSERT_OK_AND_ASSIGN( + auto sas_token, + GetContainerSASToken(data.container_name, + Azure::Storage::StorageSharedKeyCredential( + env->account_name(), env->account_key()))); + // AzureOptions::FromUri will not cut off extra query parameters that it consumes, so + // make sure these don't cause problems. + ARROW_EXPECT_OK(options.ConfigureSASCredential( + "?blob_storage_authority=dummy_value0&" + sas_token.substr(1) + + "&credential_kind=dummy-value1")); + EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options)); + + AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File); + + // Test CopyFile because the most obvious implementation requires generating a SAS + // token at runtime which doesn't work when the original auth is SAS token. + ASSERT_OK(fs->CopyFile(data.ObjectPath(), data.ObjectPath() + "_copy")); + AssertFileInfo(fs.get(), data.ObjectPath() + "_copy", FileType::File); + } + private: using StringMatcher = ::testing::PolymorphicMatcher<::testing::internal::HasSubstrMatcher>; @@ -2330,6 +2403,10 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateContainerFromPath) { TYPED_TEST(TestAzureFileSystemOnAllScenarios, MovePath) { this->TestMovePath(); } +TYPED_TEST(TestAzureFileSystemOnAllScenarios, SASCredential) { + this->TestSASCredential(); +} + // Tests using Azurite (the local Azure emulator) TEST_F(TestAzuriteFileSystem, CheckIfHierarchicalNamespaceIsEnabledRuntimeError) { @@ -2636,6 +2713,17 @@ TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) { EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); } +TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationDifferentContainer) { + auto data = SetUpPreexistingData(); + auto data2 = SetUpPreexistingData(); + const auto destination_path = data2.ContainerPath("copy-destionation"); + ASSERT_OK(fs()->CopyFile(data.ObjectPath(), destination_path)); + ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(destination_path)); + ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(info)); + ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024)); + EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString()); +} + TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationSame) { auto data = SetUpPreexistingData(); ASSERT_OK(fs()->CopyFile(data.ObjectPath(), data.ObjectPath()));