From c39c31bbad6f7f682d4ca04efe200bb070e79953 Mon Sep 17 00:00:00 2001 From: Thomas Newton Date: Thu, 9 Nov 2023 04:11:22 +0000 Subject: [PATCH] GH-38335: [C++] Implement `GetFileInfo` for a single file in Azure filesystem (#38505) ### Rationale for this change `GetFileInfo` is an important part of an Arrow filesystem implementation. ### What changes are included in this PR? - Start `azurefs_internal` similar to GCS and S3 filesystems. - Implement `HierarchicalNamespaceDetector`. - This does not use the obvious and simple implementation. It uses a more complicated option inspired by `hadoop-azure` that avoids requiring the significantly elevated permissions needed for `blob_service_client->GetAccountInfo()`. - This can't be detected an initialisation time of the filesystem because it requires a `container_name`. Its packed into its only class so that the result can be cached. - Implement `GetFileInfo` for single paths. - Supports hierarchical or flat namespace accounts and takes advantage of hierarchical namespace where possible to avoid unnecessary extra calls to blob storage. The performance difference is actually noticeable just from running the `GetFileInfoObjectWithNestedStructure` test against real flat and hierarchical accounts. Its about 3 seconds with hierarchical namespace or 5 seconds with a flat namespace. - Update tests with TODO(GH-38335) to now use this implementation of `GetFileInfo` to replace the temporary direct Azure SDK usage. - Rename the main test fixture and introduce new ones for connecting to real blob storage. If details of real blob storage is not provided then the real blob storage tests will be skipped. ### Are these changes tested? Yes. There are new Azurite based tests for everything that can be tested with Azurite. There are also some tests that are designed to test against a real blob storage account. This is because [Azurite cannot emulate a hierarchical namespace account](https://github.com/Azure/Azurite/issues/553). Additionally some of the behaviour used to detect a hierarchical namespace account is different on Azurite compared to a real flat namespace account. These tests will be automatically skipped unless environment variables are provided with details for connecting to the relevant real storage accounts. Initially I based the tests on the GCS filesystem but I added a few extras where I thought it was appropriate. ### Are there any user-facing changes? Yes. `GetFileInfo` is now supported on the Azure filesystem. * Closes: #38335 Lead-authored-by: Thomas Newton Co-authored-by: Sutou Kouhei Signed-off-by: Sutou Kouhei --- cpp/src/arrow/CMakeLists.txt | 4 +- cpp/src/arrow/filesystem/azurefs.cc | 158 ++++++++-- cpp/src/arrow/filesystem/azurefs_internal.cc | 88 ++++++ cpp/src/arrow/filesystem/azurefs_internal.h | 42 +++ cpp/src/arrow/filesystem/azurefs_test.cc | 288 ++++++++++++++----- cpp/src/arrow/filesystem/path_util.cc | 11 +- cpp/src/arrow/filesystem/path_util.h | 3 + 7 files changed, 497 insertions(+), 97 deletions(-) create mode 100644 cpp/src/arrow/filesystem/azurefs_internal.cc create mode 100644 cpp/src/arrow/filesystem/azurefs_internal.h diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 9a6117011535e..101b089ba837f 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -502,8 +502,8 @@ if(ARROW_FILESYSTEM) filesystem/util_internal.cc) if(ARROW_AZURE) - list(APPEND ARROW_SRCS filesystem/azurefs.cc) - set_source_files_properties(filesystem/azurefs.cc + list(APPEND ARROW_SRCS filesystem/azurefs.cc filesystem/azurefs_internal.cc) + set_source_files_properties(filesystem/azurefs.cc filesystem/azurefs_internal.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON SKIP_UNITY_BUILD_INCLUSION ON) endif() diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index a04338d999d11..6359183d90bb4 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -16,8 +16,10 @@ // under the License. #include "arrow/filesystem/azurefs.h" +#include "arrow/filesystem/azurefs_internal.h" #include +#include #include "arrow/buffer.h" #include "arrow/filesystem/path_util.h" @@ -59,6 +61,7 @@ Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& account_n credentials_kind = AzureCredentialsKind::StorageCredentials; return Status::OK(); } + namespace { // An AzureFileSystem represents a single Azure storage account. AzurePath describes a @@ -79,18 +82,17 @@ struct AzurePath { "Expected an Azure object path of the form 'container/path...', got a URI: '", s, "'"); } - const auto src = internal::RemoveTrailingSlash(s); - auto first_sep = src.find_first_of(internal::kSep); + auto first_sep = s.find_first_of(internal::kSep); if (first_sep == 0) { return Status::Invalid("Path cannot start with a separator ('", s, "')"); } if (first_sep == std::string::npos) { - return AzurePath{std::string(src), std::string(src), "", {}}; + return AzurePath{std::string(s), std::string(s), "", {}}; } AzurePath path; - path.full_path = std::string(src); - path.container = std::string(src.substr(0, first_sep)); - path.path_to_file = std::string(src.substr(first_sep + 1)); + path.full_path = std::string(s); + path.container = std::string(s.substr(0, first_sep)); + path.path_to_file = std::string(s.substr(first_sep + 1)); path.path_to_file_parts = internal::SplitAbstractPath(path.path_to_file); RETURN_NOT_OK(Validate(path)); return path; @@ -146,11 +148,6 @@ Status ValidateFilePath(const AzurePath& path) { return Status::OK(); } -Status ErrorToStatus(const std::string& prefix, - const Azure::Storage::StorageException& exception) { - return Status::IOError(prefix, " Azure Error: ", exception.what()); -} - template std::string FormatValue(typename TypeTraits::CType value) { struct StringAppender { @@ -316,11 +313,13 @@ class ObjectInputFile final : public io::RandomAccessFile { return Status::OK(); } catch (const Azure::Storage::StorageException& exception) { if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { - // Could be either container or blob not found. return PathNotFound(path_); } - return ErrorToStatus( - "When fetching properties for '" + blob_client_->GetUrl() + "': ", exception); + return internal::ExceptionToStatus( + "GetProperties failed for '" + blob_client_->GetUrl() + + "' with an unexpected Azure error. Can not initialise an ObjectInputFile " + "without knowing the file size.", + exception); } } @@ -397,10 +396,12 @@ class ObjectInputFile final : public io::RandomAccessFile { ->DownloadTo(reinterpret_cast(out), nbytes, download_options) .Value.ContentRange.Length.Value(); } catch (const Azure::Storage::StorageException& exception) { - return ErrorToStatus("When reading from '" + blob_client_->GetUrl() + - "' at position " + std::to_string(position) + " for " + - std::to_string(nbytes) + " bytes: ", - exception); + return internal::ExceptionToStatus("DownloadTo from '" + blob_client_->GetUrl() + + "' at position " + std::to_string(position) + + " for " + std::to_string(nbytes) + + " bytes failed with an Azure error. ReadAt " + "failed to read the required byte range.", + exception); } } @@ -444,7 +445,6 @@ class ObjectInputFile final : public io::RandomAccessFile { int64_t content_length_ = kNoSize; std::shared_ptr metadata_; }; - } // namespace // ----------------------------------------------------------------------- @@ -453,27 +453,136 @@ class ObjectInputFile final : public io::RandomAccessFile { class AzureFileSystem::Impl { public: io::IOContext io_context_; - std::shared_ptr service_client_; + std::unique_ptr + datalake_service_client_; + std::unique_ptr blob_service_client_; AzureOptions options_; + internal::HierarchicalNamespaceDetector hierarchical_namespace_; explicit Impl(AzureOptions options, io::IOContext io_context) : io_context_(io_context), options_(std::move(options)) {} Status Init() { - service_client_ = std::make_shared( + blob_service_client_ = std::make_unique( options_.account_blob_url, options_.storage_credentials_provider); + datalake_service_client_ = + std::make_unique( + options_.account_dfs_url, options_.storage_credentials_provider); + RETURN_NOT_OK(hierarchical_namespace_.Init(datalake_service_client_.get())); return Status::OK(); } const AzureOptions& options() const { return options_; } + public: + Result GetFileInfo(const AzurePath& path) { + FileInfo info; + info.set_path(path.full_path); + + if (path.container.empty()) { + DCHECK(path.path_to_file.empty()); // The path is invalid if the container is empty + // but not path_to_file. + // path must refer to the root of the Azure storage account. This is a directory, + // and there isn't any extra metadata to fetch. + info.set_type(FileType::Directory); + return info; + } + if (path.path_to_file.empty()) { + // path refers to a container. This is a directory if it exists. + auto container_client = + blob_service_client_->GetBlobContainerClient(path.container); + try { + auto properties = container_client.GetProperties(); + info.set_type(FileType::Directory); + info.set_mtime( + std::chrono::system_clock::time_point(properties.Value.LastModified)); + return info; + } catch (const Azure::Storage::StorageException& exception) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + info.set_type(FileType::NotFound); + return info; + } + return internal::ExceptionToStatus( + "GetProperties for '" + container_client.GetUrl() + + "' failed with an unexpected Azure error. GetFileInfo is unable to " + "determine whether the container exists.", + exception); + } + } + auto file_client = datalake_service_client_->GetFileSystemClient(path.container) + .GetFileClient(path.path_to_file); + try { + auto properties = file_client.GetProperties(); + if (properties.Value.IsDirectory) { + info.set_type(FileType::Directory); + } else if (internal::HasTrailingSlash(path.path_to_file)) { + // For a path with a trailing slash a hierarchical namespace may return a blob + // with that trailing slash removed. For consistency with flat namespace and + // other filesystems we chose to return NotFound. + info.set_type(FileType::NotFound); + return info; + } else { + info.set_type(FileType::File); + info.set_size(properties.Value.FileSize); + } + info.set_mtime( + std::chrono::system_clock::time_point(properties.Value.LastModified)); + return info; + } catch (const Azure::Storage::StorageException& exception) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, + hierarchical_namespace_.Enabled(path.container)); + if (hierarchical_namespace_enabled) { + // If the hierarchical namespace is enabled, then the storage account will have + // explicit directories. Neither a file nor a directory was found. + info.set_type(FileType::NotFound); + return info; + } + // On flat namespace accounts there are no real directories. Directories are only + // implied by using `/` in the blob name. + Azure::Storage::Blobs::ListBlobsOptions list_blob_options; + + // If listing the prefix `path.path_to_file` with trailing slash returns at least + // one result then `path` refers to an implied directory. + auto prefix = internal::EnsureTrailingSlash(path.path_to_file); + list_blob_options.Prefix = prefix; + // We only need to know if there is at least one result, so minimise page size + // for efficiency. + list_blob_options.PageSizeHint = 1; + + try { + auto paged_list_result = + blob_service_client_->GetBlobContainerClient(path.container) + .ListBlobs(list_blob_options); + if (paged_list_result.Blobs.size() > 0) { + info.set_type(FileType::Directory); + } else { + info.set_type(FileType::NotFound); + } + return info; + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "ListBlobs for '" + prefix + + "' failed with an unexpected Azure error. GetFileInfo is unable to " + "determine whether the path should be considered an implied directory.", + exception); + } + } + return internal::ExceptionToStatus( + "GetProperties for '" + file_client.GetUrl() + + "' failed with an unexpected " + "Azure error. GetFileInfo is unable to determine whether the path exists.", + exception); + } + } + Result> OpenInputFile(const std::string& s, AzureFileSystem* fs) { ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(s)); ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(s)); RETURN_NOT_OK(ValidateFilePath(path)); auto blob_client = std::make_shared( - service_client_->GetBlobContainerClient(path.container) + blob_service_client_->GetBlobContainerClient(path.container) .GetBlobClient(path.path_to_file)); auto ptr = @@ -494,7 +603,7 @@ class AzureFileSystem::Impl { ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(info.path())); RETURN_NOT_OK(ValidateFilePath(path)); auto blob_client = std::make_shared( - service_client_->GetBlobContainerClient(path.container) + blob_service_client_->GetBlobContainerClient(path.container) .GetBlobClient(path.path_to_file)); auto ptr = std::make_shared(blob_client, fs->io_context(), @@ -518,7 +627,8 @@ bool AzureFileSystem::Equals(const FileSystem& other) const { } Result AzureFileSystem::GetFileInfo(const std::string& path) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + ARROW_ASSIGN_OR_RAISE(auto p, AzurePath::FromString(path)); + return impl_->GetFileInfo(p); } Result AzureFileSystem::GetFileInfo(const FileSelector& select) { diff --git a/cpp/src/arrow/filesystem/azurefs_internal.cc b/cpp/src/arrow/filesystem/azurefs_internal.cc new file mode 100644 index 0000000000000..3e545d670cb04 --- /dev/null +++ b/cpp/src/arrow/filesystem/azurefs_internal.cc @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/filesystem/azurefs_internal.h" + +#include + +#include "arrow/result.h" + +namespace arrow::fs::internal { + +Status ExceptionToStatus(const std::string& prefix, + const Azure::Storage::StorageException& exception) { + return Status::IOError(prefix, " Azure Error: ", exception.what()); +} + +Status HierarchicalNamespaceDetector::Init( + Azure::Storage::Files::DataLake::DataLakeServiceClient* datalake_service_client) { + datalake_service_client_ = datalake_service_client; + return Status::OK(); +} + +Result HierarchicalNamespaceDetector::Enabled(const std::string& container_name) { + // Hierarchical namespace can't easily be changed after the storage account is created + // and its common across all containers in the storage account. Do nothing until we've + // checked for a cached result. + if (enabled_.has_value()) { + return enabled_.value(); + } + + // This approach is inspired by hadoop-azure + // https://github.com/apache/hadoop/blob/7c6af6a5f626d18d68b656d085cc23e4c1f7a1ef/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystemStore.java#L356. + // Unfortunately `blob_service_client->GetAccountInfo()` requires significantly + // elevated permissions. + // https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-service-properties?tabs=azure-ad#authorization + auto filesystem_client = datalake_service_client_->GetFileSystemClient(container_name); + auto directory_client = filesystem_client.GetDirectoryClient("/"); + try { + directory_client.GetAccessControlList(); + enabled_ = true; + } catch (const Azure::Storage::StorageException& exception) { + // GetAccessControlList will fail on storage accounts without hierarchical + // namespace enabled. + + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::BadRequest || + exception.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) { + // Flat namespace storage accounts with soft delete enabled return + // Conflict - This endpoint does not support BlobStorageEvents or SoftDelete + // otherwise it returns: BadRequest - This operation is only supported on a + // hierarchical namespace account. + enabled_ = false; + } else if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + // Azurite returns NotFound. + try { + filesystem_client.GetProperties(); + enabled_ = false; + } catch (const Azure::Storage::StorageException& exception) { + return ExceptionToStatus("Failed to confirm '" + filesystem_client.GetUrl() + + "' is an accessible container. Therefore the " + "hierarchical namespace check was invalid.", + exception); + } + } else { + return ExceptionToStatus( + "GetAccessControlList for '" + directory_client.GetUrl() + + "' failed with an unexpected Azure error, while checking " + "whether the storage account has hierarchical namespace enabled.", + exception); + } + } + return enabled_.value(); +} + +} // namespace arrow::fs::internal diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h new file mode 100644 index 0000000000000..c3da96239a18f --- /dev/null +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include + +#include "arrow/result.h" + +namespace arrow::fs::internal { + +Status ExceptionToStatus(const std::string& prefix, + const Azure::Storage::StorageException& exception); + +class HierarchicalNamespaceDetector { + public: + Status Init( + Azure::Storage::Files::DataLake::DataLakeServiceClient* datalake_service_client); + Result Enabled(const std::string& container_name); + + private: + Azure::Storage::Files::DataLake::DataLakeServiceClient* datalake_service_client_; + std::optional enabled_; +}; + +} // namespace arrow::fs::internal diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index f57fc4d8140a0..c08a4b50b77a8 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -34,6 +34,7 @@ #include #include "arrow/filesystem/azurefs.h" +#include "arrow/filesystem/azurefs_internal.h" #include #include @@ -46,7 +47,10 @@ #include #include #include +#include +#include "arrow/filesystem/test_util.h" +#include "arrow/result.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/util/io_util.h" @@ -139,34 +143,37 @@ TEST(AzureFileSystem, OptionsCompare) { EXPECT_TRUE(options.Equals(options)); } -class TestAzureFileSystem : public ::testing::Test { +class AzureFileSystemTest : public ::testing::Test { public: std::shared_ptr fs_; - std::shared_ptr service_client_; + std::unique_ptr blob_service_client_; + std::unique_ptr + datalake_service_client_; + AzureOptions options_; std::mt19937_64 generator_; std::string container_name_; + bool suite_skipped_ = false; - TestAzureFileSystem() : generator_(std::random_device()()) {} + AzureFileSystemTest() : generator_(std::random_device()()) {} - AzureOptions MakeOptions() { - const std::string& account_name = GetAzuriteEnv()->account_name(); - const std::string& account_key = GetAzuriteEnv()->account_key(); - AzureOptions options; - options.backend = AzureBackend::Azurite; - ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); - return options; - } + virtual Result MakeOptions() = 0; void SetUp() override { - ASSERT_THAT(GetAzuriteEnv(), NotNull()); - ASSERT_OK(GetAzuriteEnv()->status()); - - container_name_ = RandomChars(32); auto options = MakeOptions(); - service_client_ = std::make_shared( - options.account_blob_url, options.storage_credentials_provider); - ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options)); - auto container_client = service_client_->GetBlobContainerClient(container_name_); + if (options.ok()) { + options_ = *options; + } else { + suite_skipped_ = true; + GTEST_SKIP() << options.status().message(); + } + container_name_ = RandomChars(32); + blob_service_client_ = std::make_unique( + options_.account_blob_url, options_.storage_credentials_provider); + datalake_service_client_ = + std::make_unique( + options_.account_dfs_url, options_.storage_credentials_provider); + ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); + auto container_client = blob_service_client_->GetBlobContainerClient(container_name_); container_client.CreateIfNotExists(); auto blob_client = container_client.GetBlockBlobClient(PreexistingObjectName()); @@ -175,10 +182,13 @@ class TestAzureFileSystem : public ::testing::Test { } void TearDown() override { - auto containers = service_client_->ListBlobContainers(); - for (auto container : containers.BlobContainers) { - auto container_client = service_client_->GetBlobContainerClient(container.Name); - container_client.DeleteIfExists(); + if (!suite_skipped_) { + auto containers = blob_service_client_->ListBlobContainers(); + for (auto container : containers.BlobContainers) { + auto container_client = + blob_service_client_->GetBlobContainerClient(container.Name); + container_client.DeleteIfExists(); + } } } @@ -218,15 +228,175 @@ class TestAzureFileSystem : public ::testing::Test { void UploadLines(const std::vector& lines, const char* path_to_file, int total_size) { // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. - auto blob_client = service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(path_to_file); + auto blob_client = + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(path_to_file); std::string all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); blob_client.UploadFrom(reinterpret_cast(all_lines.data()), total_size); } + + void RunGetFileInfoObjectWithNestedStructureTest(); + void RunGetFileInfoObjectTest(); +}; + +class AzuriteFileSystemTest : public AzureFileSystemTest { + Result MakeOptions() { + EXPECT_THAT(GetAzuriteEnv(), NotNull()); + ARROW_EXPECT_OK(GetAzuriteEnv()->status()); + AzureOptions options; + options.backend = AzureBackend::Azurite; + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( + GetAzuriteEnv()->account_name(), GetAzuriteEnv()->account_key())); + return options; + } +}; + +class AzureFlatNamespaceFileSystemTest : public AzureFileSystemTest { + Result MakeOptions() override { + AzureOptions options; + const auto account_key = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_KEY"); + const auto account_name = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_NAME"); + if (account_key && account_name) { + RETURN_NOT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); + return options; + } + return Status::Cancelled( + "Connection details not provided for a real flat namespace " + "account."); + } }; -TEST_F(TestAzureFileSystem, OpenInputStreamString) { +class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest { + Result MakeOptions() override { + AzureOptions options; + const auto account_key = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_KEY"); + const auto account_name = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_NAME"); + if (account_key && account_name) { + RETURN_NOT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); + return options; + } + return Status::Cancelled( + "Connection details not provided for a real hierarchical namespace " + "account."); + } +}; + +TEST_F(AzureFlatNamespaceFileSystemTest, DetectHierarchicalNamespace) { + auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); + ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); +} + +TEST_F(AzureHierarchicalNamespaceFileSystemTest, DetectHierarchicalNamespace) { + auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); + ASSERT_OK_AND_EQ(true, hierarchical_namespace.Enabled(PreexistingContainerName())); +} + +TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespace) { + auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); + ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); +} + +TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespaceFailsWithMissingContainer) { + auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); + ASSERT_NOT_OK(hierarchical_namespace.Enabled("non-existent-container")); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoAccount) { + AssertFileInfo(fs_.get(), "", FileType::Directory); + + // URI + ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://")); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoContainer) { + AssertFileInfo(fs_.get(), PreexistingContainerName(), FileType::Directory); + + AssertFileInfo(fs_.get(), "non-existent-container", FileType::NotFound); + + // URI + ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingContainerName())); +} + +void AzureFileSystemTest::RunGetFileInfoObjectWithNestedStructureTest() { + // Adds detailed tests to handle cases of different edge cases + // with directory naming conventions (e.g. with and without slashes). + constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; + // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(kObjectName) + .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + + // 0 is immediately after "/" lexicographically, ensure that this doesn't + // cause unexpected issues. + // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient("test-object-dir/some_other_dir0") + .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(std::string(kObjectName) + "0") + .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + + AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName, FileType::File); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName + "/", + FileType::NotFound); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir", + FileType::Directory); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/", + FileType::Directory); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/some_other_dir", + FileType::Directory); + AssertFileInfo(fs_.get(), + PreexistingContainerPath() + "test-object-dir/some_other_dir/", + FileType::Directory); + + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-di", + FileType::NotFound); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/some_other_di", + FileType::NotFound); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoObjectWithNestedStructure) { + RunGetFileInfoObjectWithNestedStructureTest(); +} + +TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObjectWithNestedStructure) { + RunGetFileInfoObjectWithNestedStructureTest(); + datalake_service_client_->GetFileSystemClient(PreexistingContainerName()) + .GetDirectoryClient("test-empty-object-dir") + .Create(); + + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-empty-object-dir", + FileType::Directory); +} + +void AzureFileSystemTest::RunGetFileInfoObjectTest() { + auto object_properties = + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlobClient(PreexistingObjectName()) + .GetProperties() + .Value; + + AssertFileInfo(fs_.get(), PreexistingObjectPath(), FileType::File, + std::chrono::system_clock::time_point(object_properties.LastModified), + static_cast(object_properties.BlobSize)); + + // URI + ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingObjectName())); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoObject) { RunGetFileInfoObjectTest(); } + +TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObject) { + RunGetFileInfoObjectTest(); +} + +TEST_F(AzuriteFileSystemTest, OpenInputStreamString) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); @@ -234,7 +404,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamString) { EXPECT_EQ(buffer->ToString(), kLoremIpsum); } -TEST_F(TestAzureFileSystem, OpenInputStreamStringBuffers) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamStringBuffers) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); @@ -248,10 +418,8 @@ TEST_F(TestAzureFileSystem, OpenInputStreamStringBuffers) { EXPECT_EQ(contents, kLoremIpsum); } -TEST_F(TestAzureFileSystem, OpenInputStreamInfo) { - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(PreexistingObjectPath())); - arrow::fs::FileInfo info(PreexistingObjectPath(), FileType::File); +TEST_F(AzuriteFileSystemTest, OpenInputStreamInfo) { + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingObjectPath())); std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(info)); @@ -260,10 +428,10 @@ TEST_F(TestAzureFileSystem, OpenInputStreamInfo) { EXPECT_EQ(buffer->ToString(), kLoremIpsum); } -TEST_F(TestAzureFileSystem, OpenInputStreamEmpty) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamEmpty) { const auto path_to_file = "empty-object.txt"; const auto path = PreexistingContainerPath() + path_to_file; - service_client_->GetBlobContainerClient(PreexistingContainerName()) + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlockBlobClient(path_to_file) .UploadFrom(nullptr, 0); @@ -274,27 +442,23 @@ TEST_F(TestAzureFileSystem, OpenInputStreamEmpty) { EXPECT_EQ(size, 0); } -TEST_F(TestAzureFileSystem, OpenInputStreamNotFound) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamNotFound) { ASSERT_RAISES(IOError, fs_->OpenInputStream(NotFoundObjectPath())); } -TEST_F(TestAzureFileSystem, OpenInputStreamInfoInvalid) { - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(PreexistingBucketPath())); - arrow::fs::FileInfo info(PreexistingContainerPath(), FileType::Directory); +TEST_F(AzuriteFileSystemTest, OpenInputStreamInfoInvalid) { + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingContainerPath())); ASSERT_RAISES(IOError, fs_->OpenInputStream(info)); - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(NotFoundObjectPath())); - arrow::fs::FileInfo info2(PreexistingContainerPath(), FileType::NotFound); + ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(NotFoundObjectPath())); ASSERT_RAISES(IOError, fs_->OpenInputStream(info2)); } -TEST_F(TestAzureFileSystem, OpenInputStreamUri) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamUri) { ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfss://" + PreexistingObjectPath())); } -TEST_F(TestAzureFileSystem, OpenInputStreamTrailingSlash) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamTrailingSlash) { ASSERT_RAISES(IOError, fs_->OpenInputStream(PreexistingObjectPath() + '/')); } @@ -321,7 +485,8 @@ std::shared_ptr NormalizerKeyValueMetadata( value = "2023-10-31T08:15:20Z"; } } else if (key == "ETag") { - if (internal::StartsWith(value, "\"") && internal::EndsWith(value, "\"")) { + if (arrow::internal::StartsWith(value, "\"") && + arrow::internal::EndsWith(value, "\"")) { // Valid value value = "\"ETagValue\""; } @@ -332,7 +497,7 @@ std::shared_ptr NormalizerKeyValueMetadata( } }; // namespace -TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamReadMetadata) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); @@ -362,7 +527,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { NormalizerKeyValueMetadata(actual)->ToString()); } -TEST_F(TestAzureFileSystem, OpenInputStreamClosed) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamClosed) { ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(PreexistingObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; @@ -371,7 +536,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamClosed) { ASSERT_RAISES(Invalid, stream->Tell()); } -TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { +TEST_F(AzuriteFileSystemTest, OpenInputFileMixedReadVsReadAt) { // Create a file large enough to make the random access tests non-trivial. auto constexpr kLineWidth = 100; auto constexpr kLineCount = 4096; @@ -417,7 +582,7 @@ TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { } } -TEST_F(TestAzureFileSystem, OpenInputFileRandomSeek) { +TEST_F(AzuriteFileSystemTest, OpenInputFileRandomSeek) { // Create a file large enough to make the random access tests non-trivial. auto constexpr kLineWidth = 100; auto constexpr kLineCount = 4096; @@ -445,14 +610,15 @@ TEST_F(TestAzureFileSystem, OpenInputFileRandomSeek) { } } -TEST_F(TestAzureFileSystem, OpenInputFileIoContext) { +TEST_F(AzuriteFileSystemTest, OpenInputFileIoContext) { // Create a test file. const auto path_to_file = "OpenInputFileIoContext/object-name"; const auto path = PreexistingContainerPath() + path_to_file; const std::string contents = "The quick brown fox jumps over the lazy dog"; - auto blob_client = service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(path_to_file); + auto blob_client = + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(path_to_file); blob_client.UploadFrom(reinterpret_cast(contents.data()), contents.length()); @@ -461,10 +627,8 @@ TEST_F(TestAzureFileSystem, OpenInputFileIoContext) { EXPECT_EQ(fs_->io_context().external_id(), file->io_context().external_id()); } -TEST_F(TestAzureFileSystem, OpenInputFileInfo) { - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(PreexistingObjectPath())); - arrow::fs::FileInfo info(PreexistingObjectPath(), FileType::File); +TEST_F(AzuriteFileSystemTest, OpenInputFileInfo) { + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingObjectPath())); std::shared_ptr file; ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(info)); @@ -478,23 +642,19 @@ TEST_F(TestAzureFileSystem, OpenInputFileInfo) { EXPECT_EQ(std::string(buffer.data(), size), expected); } -TEST_F(TestAzureFileSystem, OpenInputFileNotFound) { +TEST_F(AzuriteFileSystemTest, OpenInputFileNotFound) { ASSERT_RAISES(IOError, fs_->OpenInputFile(NotFoundObjectPath())); } -TEST_F(TestAzureFileSystem, OpenInputFileInfoInvalid) { - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(PreexistingContainerPath())); - arrow::fs::FileInfo info(PreexistingContainerPath(), FileType::File); +TEST_F(AzuriteFileSystemTest, OpenInputFileInfoInvalid) { + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingContainerPath())); ASSERT_RAISES(IOError, fs_->OpenInputFile(info)); - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(NotFoundObjectPath())); - arrow::fs::FileInfo info2(NotFoundObjectPath(), FileType::NotFound); + ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(NotFoundObjectPath())); ASSERT_RAISES(IOError, fs_->OpenInputFile(info2)); } -TEST_F(TestAzureFileSystem, OpenInputFileClosed) { +TEST_F(AzuriteFileSystemTest, OpenInputFileClosed) { ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputFile(PreexistingObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index 90af3c66ff8d4..46ea436a9f31a 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -191,18 +191,15 @@ std::string_view RemoveLeadingSlash(std::string_view key) { } Status AssertNoTrailingSlash(std::string_view key) { - if (key.back() == '/') { + if (HasTrailingSlash(key)) { return NotAFile(key); } return Status::OK(); } -bool HasLeadingSlash(std::string_view key) { - if (key.front() != '/') { - return false; - } - return true; -} +bool HasTrailingSlash(std::string_view key) { return key.back() == '/'; } + +bool HasLeadingSlash(std::string_view key) { return key.front() == '/'; } Result MakeAbstractPathRelative(const std::string& base, const std::string& path) { diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index 13a74b7fa12c8..2c8c123e779f4 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -94,6 +94,9 @@ std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root = fa ARROW_EXPORT Status AssertNoTrailingSlash(std::string_view s); +ARROW_EXPORT +bool HasTrailingSlash(std::string_view s); + ARROW_EXPORT bool HasLeadingSlash(std::string_view s);