Skip to content

Commit

Permalink
apacheGH-44308: [C++][FS][Azure] Implement SAS token authentication (a…
Browse files Browse the repository at this point in the history
…pache#45021)

### Rationale for this change
SAS token auth is sometimes useful and it the last one we haven't implemented. 

### What changes are included in this PR?
- Implement `ConfigureSasCredential`
- Update `AzureOptions::FromUri` so that simply appending a SAS token to a blob storage URI works. e.g. `AzureOptions::FromUri("abfs://file_system@ account.dfs.core.windows.net/?se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%2B1SqLxPK%2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04")`
  - SAS tokens are made up of a bunch of URI query parameters that I'm not sure we can exhaustively list.
  - Therefore we now assume that any unrecognised URI query parameters are assumed to be part of a SAS token, instead of returning an error status. 
- Update `CopyFile` to use StartCopyFromUri instead of CopyFromUri
  - This avoids the need to generate SAS tokens. 
  - Supports blobs bigger than 256MiB
  - This makes apache#41315 redundant
 
### Are these changes tested?
Yes
- Added new tests for authenticating with SAS and doing some operations including `CopyFile`
- Added new tests for `AzureOptions::FromUri` with a SAS token. 

I also made sure to run the tests which connect to real blob storage. 

### Are there any user-facing changes?
- SAS token in now supported
- Unrecognised URI query parameters are ignored by `AzureOptions::FromUri` instead of failing fast. IMO this is a regression but still the best option to support SAS token. 

* GitHub Issue: apache#44308

Authored-by: Thomas Newton <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
  • Loading branch information
Tom-Newton authored Dec 17, 2024
1 parent 2533a9e commit ba2b9e5
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 43 deletions.
89 changes: 54 additions & 35 deletions cpp/src/arrow/filesystem/azurefs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,18 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) {
std::string tenant_id;
std::string client_id;
std::string client_secret;

// These query parameters are the union of the following docs:
// https://learn.microsoft.com/en-us/rest/api/storageservices/create-account-sas#specify-the-account-sas-parameters
// https://learn.microsoft.com/en-us/rest/api/storageservices/create-service-sas#construct-a-service-sas
// (excluding parameters for table storage only)
// https://learn.microsoft.com/en-us/rest/api/storageservices/create-user-delegation-sas#construct-a-user-delegation-sas
static const std::set<std::string> sas_token_query_parameters = {
"sv", "ss", "sr", "st", "se", "sp", "si", "sip", "spr",
"skoid", "sktid", "srt", "skt", "ske", "skv", "sks", "saoid", "suoid",
"scid", "sdd", "ses", "sig", "rscc", "rscd", "rsce", "rscl", "rsct",
};

ARROW_ASSIGN_OR_RAISE(const auto options_items, uri.query_items());
for (const auto& kv : options_items) {
if (kv.first == "blob_storage_authority") {
Expand Down Expand Up @@ -147,6 +159,9 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) {
} else if (kv.first == "background_writes") {
ARROW_ASSIGN_OR_RAISE(background_writes,
::arrow::internal::ParseBoolean(kv.second));
} else if (sas_token_query_parameters.find(kv.first) !=
sas_token_query_parameters.end()) {
credential_kind = CredentialKind::kSASToken;
} else {
return Status::Invalid(
"Unexpected query parameter in Azure Blob File System URI: '", kv.first, "'");
Expand Down Expand Up @@ -180,6 +195,13 @@ Status AzureOptions::ExtractFromUriQuery(const Uri& uri) {
case CredentialKind::kEnvironment:
RETURN_NOT_OK(ConfigureEnvironmentCredential());
break;
case CredentialKind::kSASToken:
// Reconstructing the SAS token without the other URI query parameters is awkward
// because some parts are URI escaped and some parts are not. Instead we just
// pass through the entire query string and Azure ignores the extra query
// parameters.
RETURN_NOT_OK(ConfigureSASCredential("?" + uri.query_string()));
break;
default:
// Default credential
break;
Expand Down Expand Up @@ -225,7 +247,6 @@ Result<AzureOptions> AzureOptions::FromUri(const std::string& uri_string,
}

bool AzureOptions::Equals(const AzureOptions& other) const {
// TODO(GH-38598): update here when more auth methods are added.
const bool equals = blob_storage_authority == other.blob_storage_authority &&
dfs_storage_authority == other.dfs_storage_authority &&
blob_storage_scheme == other.blob_storage_scheme &&
Expand All @@ -243,6 +264,8 @@ bool AzureOptions::Equals(const AzureOptions& other) const {
case CredentialKind::kStorageSharedKey:
return storage_shared_key_credential_->AccountName ==
other.storage_shared_key_credential_->AccountName;
case CredentialKind::kSASToken:
return sas_token_ == other.sas_token_;
case CredentialKind::kClientSecret:
case CredentialKind::kCLI:
case CredentialKind::kManagedIdentity:
Expand Down Expand Up @@ -311,6 +334,15 @@ Status AzureOptions::ConfigureAccountKeyCredential(const std::string& account_ke
return Status::OK();
}

Status AzureOptions::ConfigureSASCredential(const std::string& sas_token) {
credential_kind_ = CredentialKind::kSASToken;
if (account_name.empty()) {
return Status::Invalid("AzureOptions doesn't contain a valid account name");
}
sas_token_ = sas_token;
return Status::OK();
}

Status AzureOptions::ConfigureClientSecretCredential(const std::string& tenant_id,
const std::string& client_id,
const std::string& client_secret) {
Expand Down Expand Up @@ -372,6 +404,9 @@ Result<std::unique_ptr<Blobs::BlobServiceClient>> AzureOptions::MakeBlobServiceC
case CredentialKind::kStorageSharedKey:
return std::make_unique<Blobs::BlobServiceClient>(AccountBlobUrl(account_name),
storage_shared_key_credential_);
case CredentialKind::kSASToken:
return std::make_unique<Blobs::BlobServiceClient>(AccountBlobUrl(account_name) +
sas_token_);
}
return Status::Invalid("AzureOptions doesn't contain a valid auth configuration");
}
Expand Down Expand Up @@ -404,29 +439,13 @@ AzureOptions::MakeDataLakeServiceClient() const {
case CredentialKind::kStorageSharedKey:
return std::make_unique<DataLake::DataLakeServiceClient>(
AccountDfsUrl(account_name), storage_shared_key_credential_);
case CredentialKind::kSASToken:
return std::make_unique<DataLake::DataLakeServiceClient>(
AccountBlobUrl(account_name) + sas_token_);
}
return Status::Invalid("AzureOptions doesn't contain a valid auth configuration");
}

Result<std::string> AzureOptions::GenerateSASToken(
Storage::Sas::BlobSasBuilder* builder, Blobs::BlobServiceClient* client) const {
using SasProtocol = Storage::Sas::SasProtocol;
builder->Protocol =
blob_storage_scheme == "http" ? SasProtocol::HttpsAndHttp : SasProtocol::HttpsOnly;
if (storage_shared_key_credential_) {
return builder->GenerateSasToken(*storage_shared_key_credential_);
} else {
// GH-39344: This part isn't tested. This may not work.
try {
auto delegation_key_response = client->GetUserDelegationKey(builder->ExpiresOn);
return builder->GenerateSasToken(delegation_key_response.Value, account_name);
} catch (const Storage::StorageException& exception) {
return ExceptionToStatus(exception, "GetUserDelegationKey failed for '",
client->GetUrl(), "'.");
}
}
}

namespace {

// An AzureFileSystem represents an Azure storage account. An AzureLocation describes a
Expand Down Expand Up @@ -3161,19 +3180,7 @@ class AzureFileSystem::Impl {
if (src == dest) {
return Status::OK();
}
std::string sas_token;
{
Storage::Sas::BlobSasBuilder builder;
std::chrono::seconds available_period(60);
builder.ExpiresOn = std::chrono::system_clock::now() + available_period;
builder.BlobContainerName = src.container;
builder.BlobName = src.path;
builder.Resource = Storage::Sas::BlobSasResource::Blob;
builder.SetPermissions(Storage::Sas::BlobSasPermissions::Read);
ARROW_ASSIGN_OR_RAISE(
sas_token, options_.GenerateSASToken(&builder, blob_service_client_.get()));
}
auto src_url = GetBlobClient(src.container, src.path).GetUrl() + sas_token;
auto src_url = GetBlobClient(src.container, src.path).GetUrl();
auto dest_blob_client = GetBlobClient(dest.container, dest.path);
if (!dest.path.empty()) {
auto dest_parent = dest.parent();
Expand All @@ -3186,9 +3193,21 @@ class AzureFileSystem::Impl {
}
}
try {
dest_blob_client.CopyFromUri(src_url);
// We use StartCopyFromUri instead of CopyFromUri because it supports blobs larger
// than 256 MiB and it doesn't require generating a SAS token to authenticate
// reading a source blob in the same storage account.
auto copy_operation = dest_blob_client.StartCopyFromUri(src_url);
// For large blobs, the copy operation may be slow so we need to poll until it
// completes. We use a polling interval of 1 second.
copy_operation.PollUntilDone(std::chrono::milliseconds(1000));
} catch (const Storage::StorageException& exception) {
return ExceptionToStatus(exception, "Failed to copy a blob. (", src_url, " -> ",
// StartCopyFromUri failed or a GetProperties call inside PollUntilDone failed.
return ExceptionToStatus(
exception, "Failed to start blob copy or poll status of ongoing copy. (",
src_url, " -> ", dest_blob_client.GetUrl(), ")");
} catch (const Azure::Core::RequestFailedException& exception) {
// A GetProperties call inside PollUntilDone returned a failed CopyStatus.
return ExceptionToStatus(exception, "Failed to copy blob. (", src_url, " -> ",
dest_blob_client.GetUrl(), ")");
}
return Status::OK();
Expand Down
14 changes: 6 additions & 8 deletions cpp/src/arrow/filesystem/azurefs.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,6 @@ namespace Azure::Storage::Blobs {
class BlobServiceClient;
}

namespace Azure::Storage::Sas {
struct BlobSasBuilder;
}

namespace Azure::Storage::Files::DataLake {
class DataLakeFileSystemClient;
class DataLakeServiceClient;
Expand Down Expand Up @@ -120,6 +116,7 @@ struct ARROW_EXPORT AzureOptions {
kDefault,
kAnonymous,
kStorageSharedKey,
kSASToken,
kClientSecret,
kManagedIdentity,
kCLI,
Expand All @@ -129,6 +126,7 @@ struct ARROW_EXPORT AzureOptions {

std::shared_ptr<Azure::Storage::StorageSharedKeyCredential>
storage_shared_key_credential_;
std::string sas_token_;
mutable std::shared_ptr<Azure::Core::Credentials::TokenCredential> token_credential_;

public:
Expand Down Expand Up @@ -180,6 +178,9 @@ struct ARROW_EXPORT AzureOptions {
/// AzureOptions::ConfigureClientSecretCredential() is called.
/// * client_secret: You must specify "tenant_id" and "client_id"
/// too. AzureOptions::ConfigureClientSecretCredential() is called.
/// * A SAS token is made up of several query parameters. Appending a SAS
/// token to the URI configures SAS token auth by calling
/// AzureOptions::ConfigureSASCredential().
///
/// [1]:
/// https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction-abfs-uri
Expand All @@ -189,6 +190,7 @@ struct ARROW_EXPORT AzureOptions {
Status ConfigureDefaultCredential();
Status ConfigureAnonymousCredential();
Status ConfigureAccountKeyCredential(const std::string& account_key);
Status ConfigureSASCredential(const std::string& sas_token);
Status ConfigureClientSecretCredential(const std::string& tenant_id,
const std::string& client_id,
const std::string& client_secret);
Expand All @@ -207,10 +209,6 @@ struct ARROW_EXPORT AzureOptions {

Result<std::unique_ptr<Azure::Storage::Files::DataLake::DataLakeServiceClient>>
MakeDataLakeServiceClient() const;

Result<std::string> GenerateSASToken(
Azure::Storage::Sas::BlobSasBuilder* builder,
Azure::Storage::Blobs::BlobServiceClient* client) const;
};

/// \brief FileSystem implementation backed by Azure Blob Storage (ABS) [1] and
Expand Down
88 changes: 88 additions & 0 deletions cpp/src/arrow/filesystem/azurefs_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -714,6 +714,36 @@ class TestAzureOptions : public ::testing::Test {
ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kEnvironment);
}

void TestFromUriCredentialSASToken() {
const std::string sas_token =
"?se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%2B1SqLxPK%"
"2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04";
ASSERT_OK_AND_ASSIGN(
auto options,
AzureOptions::FromUri(
"abfs://[email protected]/" + sas_token, nullptr));
ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSASToken);
ASSERT_EQ(options.sas_token_, sas_token);
}

void TestFromUriCredentialSASTokenWithOtherParameters() {
const std::string uri_query_string =
"?enable_tls=false&se=2024-12-12T18:57:47Z&sig=pAs7qEBdI6sjUhqX1nrhNAKsTY%"
"2B1SqLxPK%"
"2BbAxLiopw%3D&sp=racwdxylti&spr=https,http&sr=c&sv=2024-08-04";
ASSERT_OK_AND_ASSIGN(
auto options,
AzureOptions::FromUri(
"abfs://[email protected]:10000/container/dir/blob" + uri_query_string,
nullptr));
ASSERT_EQ(options.credential_kind_, AzureOptions::CredentialKind::kSASToken);
ASSERT_EQ(options.sas_token_, uri_query_string);
ASSERT_EQ(options.blob_storage_authority, "127.0.0.1:10000");
ASSERT_EQ(options.dfs_storage_authority, "127.0.0.1:10000");
ASSERT_EQ(options.blob_storage_scheme, "http");
ASSERT_EQ(options.dfs_storage_scheme, "http");
}

void TestFromUriCredentialInvalid() {
ASSERT_RAISES(Invalid, AzureOptions::FromUri(
"abfs://[email protected]/dir/file?"
Expand Down Expand Up @@ -801,6 +831,10 @@ TEST_F(TestAzureOptions, FromUriCredentialWorkloadIdentity) {
TEST_F(TestAzureOptions, FromUriCredentialEnvironment) {
TestFromUriCredentialEnvironment();
}
TEST_F(TestAzureOptions, FromUriCredentialSASToken) { TestFromUriCredentialSASToken(); }
TEST_F(TestAzureOptions, FromUriCredentialSASTokenWithOtherParameters) {
TestFromUriCredentialSASTokenWithOtherParameters();
}
TEST_F(TestAzureOptions, FromUriCredentialInvalid) { TestFromUriCredentialInvalid(); }
TEST_F(TestAzureOptions, FromUriBlobStorageAuthority) {
TestFromUriBlobStorageAuthority();
Expand Down Expand Up @@ -936,6 +970,20 @@ class TestAzureFileSystem : public ::testing::Test {
.Value;
}

Result<std::string> GetContainerSASToken(
const std::string& container_name,
Azure::Storage::StorageSharedKeyCredential storage_shared_key_credential) {
std::string sas_token;
Azure::Storage::Sas::BlobSasBuilder builder;
std::chrono::seconds available_period(60);
builder.ExpiresOn = std::chrono::system_clock::now() + available_period;
builder.BlobContainerName = container_name;
builder.Resource = Azure::Storage::Sas::BlobSasResource::BlobContainer;
builder.SetPermissions(Azure::Storage::Sas::BlobContainerSasPermissions::All);
builder.Protocol = Azure::Storage::Sas::SasProtocol::HttpsAndHttp;
return builder.GenerateSasToken(storage_shared_key_credential);
}

void UploadLines(const std::vector<std::string>& lines, const std::string& path,
int total_size) {
ASSERT_OK_AND_ASSIGN(auto output, fs()->OpenOutputStream(path, {}));
Expand Down Expand Up @@ -1619,6 +1667,31 @@ class TestAzureFileSystem : public ::testing::Test {
AssertObjectContents(fs.get(), path, payload);
}

void TestSASCredential() {
auto data = SetUpPreexistingData();

ASSERT_OK_AND_ASSIGN(auto env, GetAzureEnv());
ASSERT_OK_AND_ASSIGN(auto options, MakeOptions(env));
ASSERT_OK_AND_ASSIGN(
auto sas_token,
GetContainerSASToken(data.container_name,
Azure::Storage::StorageSharedKeyCredential(
env->account_name(), env->account_key())));
// AzureOptions::FromUri will not cut off extra query parameters that it consumes, so
// make sure these don't cause problems.
ARROW_EXPECT_OK(options.ConfigureSASCredential(
"?blob_storage_authority=dummy_value0&" + sas_token.substr(1) +
"&credential_kind=dummy-value1"));
EXPECT_OK_AND_ASSIGN(auto fs, AzureFileSystem::Make(options));

AssertFileInfo(fs.get(), data.ObjectPath(), FileType::File);

// Test CopyFile because the most obvious implementation requires generating a SAS
// token at runtime which doesn't work when the original auth is SAS token.
ASSERT_OK(fs->CopyFile(data.ObjectPath(), data.ObjectPath() + "_copy"));
AssertFileInfo(fs.get(), data.ObjectPath() + "_copy", FileType::File);
}

private:
using StringMatcher =
::testing::PolymorphicMatcher<::testing::internal::HasSubstrMatcher<std::string>>;
Expand Down Expand Up @@ -2330,6 +2403,10 @@ TYPED_TEST(TestAzureFileSystemOnAllScenarios, CreateContainerFromPath) {

TYPED_TEST(TestAzureFileSystemOnAllScenarios, MovePath) { this->TestMovePath(); }

TYPED_TEST(TestAzureFileSystemOnAllScenarios, SASCredential) {
this->TestSASCredential();
}

// Tests using Azurite (the local Azure emulator)

TEST_F(TestAzuriteFileSystem, CheckIfHierarchicalNamespaceIsEnabledRuntimeError) {
Expand Down Expand Up @@ -2636,6 +2713,17 @@ TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationNonexistent) {
EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString());
}

TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationDifferentContainer) {
auto data = SetUpPreexistingData();
auto data2 = SetUpPreexistingData();
const auto destination_path = data2.ContainerPath("copy-destionation");
ASSERT_OK(fs()->CopyFile(data.ObjectPath(), destination_path));
ASSERT_OK_AND_ASSIGN(auto info, fs()->GetFileInfo(destination_path));
ASSERT_OK_AND_ASSIGN(auto stream, fs()->OpenInputStream(info));
ASSERT_OK_AND_ASSIGN(auto buffer, stream->Read(1024));
EXPECT_EQ(PreexistingData::kLoremIpsum, buffer->ToString());
}

TEST_F(TestAzuriteFileSystem, CopyFileSuccessDestinationSame) {
auto data = SetUpPreexistingData();
ASSERT_OK(fs()->CopyFile(data.ObjectPath(), data.ObjectPath()));
Expand Down

0 comments on commit ba2b9e5

Please sign in to comment.