diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 9a6117011535e..101b089ba837f 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -502,8 +502,8 @@ if(ARROW_FILESYSTEM) filesystem/util_internal.cc) if(ARROW_AZURE) - list(APPEND ARROW_SRCS filesystem/azurefs.cc) - set_source_files_properties(filesystem/azurefs.cc + list(APPEND ARROW_SRCS filesystem/azurefs.cc filesystem/azurefs_internal.cc) + set_source_files_properties(filesystem/azurefs.cc filesystem/azurefs_internal.cc PROPERTIES SKIP_PRECOMPILE_HEADERS ON SKIP_UNITY_BUILD_INCLUSION ON) endif() diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index a04338d999d11..6359183d90bb4 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -16,8 +16,10 @@ // under the License. #include "arrow/filesystem/azurefs.h" +#include "arrow/filesystem/azurefs_internal.h" #include +#include #include "arrow/buffer.h" #include "arrow/filesystem/path_util.h" @@ -59,6 +61,7 @@ Status AzureOptions::ConfigureAccountKeyCredentials(const std::string& account_n credentials_kind = AzureCredentialsKind::StorageCredentials; return Status::OK(); } + namespace { // An AzureFileSystem represents a single Azure storage account. AzurePath describes a @@ -79,18 +82,17 @@ struct AzurePath { "Expected an Azure object path of the form 'container/path...', got a URI: '", s, "'"); } - const auto src = internal::RemoveTrailingSlash(s); - auto first_sep = src.find_first_of(internal::kSep); + auto first_sep = s.find_first_of(internal::kSep); if (first_sep == 0) { return Status::Invalid("Path cannot start with a separator ('", s, "')"); } if (first_sep == std::string::npos) { - return AzurePath{std::string(src), std::string(src), "", {}}; + return AzurePath{std::string(s), std::string(s), "", {}}; } AzurePath path; - path.full_path = std::string(src); - path.container = std::string(src.substr(0, first_sep)); - path.path_to_file = std::string(src.substr(first_sep + 1)); + path.full_path = std::string(s); + path.container = std::string(s.substr(0, first_sep)); + path.path_to_file = std::string(s.substr(first_sep + 1)); path.path_to_file_parts = internal::SplitAbstractPath(path.path_to_file); RETURN_NOT_OK(Validate(path)); return path; @@ -146,11 +148,6 @@ Status ValidateFilePath(const AzurePath& path) { return Status::OK(); } -Status ErrorToStatus(const std::string& prefix, - const Azure::Storage::StorageException& exception) { - return Status::IOError(prefix, " Azure Error: ", exception.what()); -} - template std::string FormatValue(typename TypeTraits::CType value) { struct StringAppender { @@ -316,11 +313,13 @@ class ObjectInputFile final : public io::RandomAccessFile { return Status::OK(); } catch (const Azure::Storage::StorageException& exception) { if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { - // Could be either container or blob not found. return PathNotFound(path_); } - return ErrorToStatus( - "When fetching properties for '" + blob_client_->GetUrl() + "': ", exception); + return internal::ExceptionToStatus( + "GetProperties failed for '" + blob_client_->GetUrl() + + "' with an unexpected Azure error. Can not initialise an ObjectInputFile " + "without knowing the file size.", + exception); } } @@ -397,10 +396,12 @@ class ObjectInputFile final : public io::RandomAccessFile { ->DownloadTo(reinterpret_cast(out), nbytes, download_options) .Value.ContentRange.Length.Value(); } catch (const Azure::Storage::StorageException& exception) { - return ErrorToStatus("When reading from '" + blob_client_->GetUrl() + - "' at position " + std::to_string(position) + " for " + - std::to_string(nbytes) + " bytes: ", - exception); + return internal::ExceptionToStatus("DownloadTo from '" + blob_client_->GetUrl() + + "' at position " + std::to_string(position) + + " for " + std::to_string(nbytes) + + " bytes failed with an Azure error. ReadAt " + "failed to read the required byte range.", + exception); } } @@ -444,7 +445,6 @@ class ObjectInputFile final : public io::RandomAccessFile { int64_t content_length_ = kNoSize; std::shared_ptr metadata_; }; - } // namespace // ----------------------------------------------------------------------- @@ -453,27 +453,136 @@ class ObjectInputFile final : public io::RandomAccessFile { class AzureFileSystem::Impl { public: io::IOContext io_context_; - std::shared_ptr service_client_; + std::unique_ptr + datalake_service_client_; + std::unique_ptr blob_service_client_; AzureOptions options_; + internal::HierarchicalNamespaceDetector hierarchical_namespace_; explicit Impl(AzureOptions options, io::IOContext io_context) : io_context_(io_context), options_(std::move(options)) {} Status Init() { - service_client_ = std::make_shared( + blob_service_client_ = std::make_unique( options_.account_blob_url, options_.storage_credentials_provider); + datalake_service_client_ = + std::make_unique( + options_.account_dfs_url, options_.storage_credentials_provider); + RETURN_NOT_OK(hierarchical_namespace_.Init(datalake_service_client_.get())); return Status::OK(); } const AzureOptions& options() const { return options_; } + public: + Result GetFileInfo(const AzurePath& path) { + FileInfo info; + info.set_path(path.full_path); + + if (path.container.empty()) { + DCHECK(path.path_to_file.empty()); // The path is invalid if the container is empty + // but not path_to_file. + // path must refer to the root of the Azure storage account. This is a directory, + // and there isn't any extra metadata to fetch. + info.set_type(FileType::Directory); + return info; + } + if (path.path_to_file.empty()) { + // path refers to a container. This is a directory if it exists. + auto container_client = + blob_service_client_->GetBlobContainerClient(path.container); + try { + auto properties = container_client.GetProperties(); + info.set_type(FileType::Directory); + info.set_mtime( + std::chrono::system_clock::time_point(properties.Value.LastModified)); + return info; + } catch (const Azure::Storage::StorageException& exception) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + info.set_type(FileType::NotFound); + return info; + } + return internal::ExceptionToStatus( + "GetProperties for '" + container_client.GetUrl() + + "' failed with an unexpected Azure error. GetFileInfo is unable to " + "determine whether the container exists.", + exception); + } + } + auto file_client = datalake_service_client_->GetFileSystemClient(path.container) + .GetFileClient(path.path_to_file); + try { + auto properties = file_client.GetProperties(); + if (properties.Value.IsDirectory) { + info.set_type(FileType::Directory); + } else if (internal::HasTrailingSlash(path.path_to_file)) { + // For a path with a trailing slash a hierarchical namespace may return a blob + // with that trailing slash removed. For consistency with flat namespace and + // other filesystems we chose to return NotFound. + info.set_type(FileType::NotFound); + return info; + } else { + info.set_type(FileType::File); + info.set_size(properties.Value.FileSize); + } + info.set_mtime( + std::chrono::system_clock::time_point(properties.Value.LastModified)); + return info; + } catch (const Azure::Storage::StorageException& exception) { + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + ARROW_ASSIGN_OR_RAISE(auto hierarchical_namespace_enabled, + hierarchical_namespace_.Enabled(path.container)); + if (hierarchical_namespace_enabled) { + // If the hierarchical namespace is enabled, then the storage account will have + // explicit directories. Neither a file nor a directory was found. + info.set_type(FileType::NotFound); + return info; + } + // On flat namespace accounts there are no real directories. Directories are only + // implied by using `/` in the blob name. + Azure::Storage::Blobs::ListBlobsOptions list_blob_options; + + // If listing the prefix `path.path_to_file` with trailing slash returns at least + // one result then `path` refers to an implied directory. + auto prefix = internal::EnsureTrailingSlash(path.path_to_file); + list_blob_options.Prefix = prefix; + // We only need to know if there is at least one result, so minimise page size + // for efficiency. + list_blob_options.PageSizeHint = 1; + + try { + auto paged_list_result = + blob_service_client_->GetBlobContainerClient(path.container) + .ListBlobs(list_blob_options); + if (paged_list_result.Blobs.size() > 0) { + info.set_type(FileType::Directory); + } else { + info.set_type(FileType::NotFound); + } + return info; + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus( + "ListBlobs for '" + prefix + + "' failed with an unexpected Azure error. GetFileInfo is unable to " + "determine whether the path should be considered an implied directory.", + exception); + } + } + return internal::ExceptionToStatus( + "GetProperties for '" + file_client.GetUrl() + + "' failed with an unexpected " + "Azure error. GetFileInfo is unable to determine whether the path exists.", + exception); + } + } + Result> OpenInputFile(const std::string& s, AzureFileSystem* fs) { ARROW_RETURN_NOT_OK(internal::AssertNoTrailingSlash(s)); ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(s)); RETURN_NOT_OK(ValidateFilePath(path)); auto blob_client = std::make_shared( - service_client_->GetBlobContainerClient(path.container) + blob_service_client_->GetBlobContainerClient(path.container) .GetBlobClient(path.path_to_file)); auto ptr = @@ -494,7 +603,7 @@ class AzureFileSystem::Impl { ARROW_ASSIGN_OR_RAISE(auto path, AzurePath::FromString(info.path())); RETURN_NOT_OK(ValidateFilePath(path)); auto blob_client = std::make_shared( - service_client_->GetBlobContainerClient(path.container) + blob_service_client_->GetBlobContainerClient(path.container) .GetBlobClient(path.path_to_file)); auto ptr = std::make_shared(blob_client, fs->io_context(), @@ -518,7 +627,8 @@ bool AzureFileSystem::Equals(const FileSystem& other) const { } Result AzureFileSystem::GetFileInfo(const std::string& path) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + ARROW_ASSIGN_OR_RAISE(auto p, AzurePath::FromString(path)); + return impl_->GetFileInfo(p); } Result AzureFileSystem::GetFileInfo(const FileSelector& select) { diff --git a/cpp/src/arrow/filesystem/azurefs_internal.cc b/cpp/src/arrow/filesystem/azurefs_internal.cc new file mode 100644 index 0000000000000..3e545d670cb04 --- /dev/null +++ b/cpp/src/arrow/filesystem/azurefs_internal.cc @@ -0,0 +1,88 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/filesystem/azurefs_internal.h" + +#include + +#include "arrow/result.h" + +namespace arrow::fs::internal { + +Status ExceptionToStatus(const std::string& prefix, + const Azure::Storage::StorageException& exception) { + return Status::IOError(prefix, " Azure Error: ", exception.what()); +} + +Status HierarchicalNamespaceDetector::Init( + Azure::Storage::Files::DataLake::DataLakeServiceClient* datalake_service_client) { + datalake_service_client_ = datalake_service_client; + return Status::OK(); +} + +Result HierarchicalNamespaceDetector::Enabled(const std::string& container_name) { + // Hierarchical namespace can't easily be changed after the storage account is created + // and its common across all containers in the storage account. Do nothing until we've + // checked for a cached result. + if (enabled_.has_value()) { + return enabled_.value(); + } + + // This approach is inspired by hadoop-azure + // https://github.com/apache/hadoop/blob/7c6af6a5f626d18d68b656d085cc23e4c1f7a1ef/hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/AzureBlobFileSystemStore.java#L356. + // Unfortunately `blob_service_client->GetAccountInfo()` requires significantly + // elevated permissions. + // https://learn.microsoft.com/en-us/rest/api/storageservices/get-blob-service-properties?tabs=azure-ad#authorization + auto filesystem_client = datalake_service_client_->GetFileSystemClient(container_name); + auto directory_client = filesystem_client.GetDirectoryClient("/"); + try { + directory_client.GetAccessControlList(); + enabled_ = true; + } catch (const Azure::Storage::StorageException& exception) { + // GetAccessControlList will fail on storage accounts without hierarchical + // namespace enabled. + + if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::BadRequest || + exception.StatusCode == Azure::Core::Http::HttpStatusCode::Conflict) { + // Flat namespace storage accounts with soft delete enabled return + // Conflict - This endpoint does not support BlobStorageEvents or SoftDelete + // otherwise it returns: BadRequest - This operation is only supported on a + // hierarchical namespace account. + enabled_ = false; + } else if (exception.StatusCode == Azure::Core::Http::HttpStatusCode::NotFound) { + // Azurite returns NotFound. + try { + filesystem_client.GetProperties(); + enabled_ = false; + } catch (const Azure::Storage::StorageException& exception) { + return ExceptionToStatus("Failed to confirm '" + filesystem_client.GetUrl() + + "' is an accessible container. Therefore the " + "hierarchical namespace check was invalid.", + exception); + } + } else { + return ExceptionToStatus( + "GetAccessControlList for '" + directory_client.GetUrl() + + "' failed with an unexpected Azure error, while checking " + "whether the storage account has hierarchical namespace enabled.", + exception); + } + } + return enabled_.value(); +} + +} // namespace arrow::fs::internal diff --git a/cpp/src/arrow/filesystem/azurefs_internal.h b/cpp/src/arrow/filesystem/azurefs_internal.h new file mode 100644 index 0000000000000..c3da96239a18f --- /dev/null +++ b/cpp/src/arrow/filesystem/azurefs_internal.h @@ -0,0 +1,42 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include + +#include "arrow/result.h" + +namespace arrow::fs::internal { + +Status ExceptionToStatus(const std::string& prefix, + const Azure::Storage::StorageException& exception); + +class HierarchicalNamespaceDetector { + public: + Status Init( + Azure::Storage::Files::DataLake::DataLakeServiceClient* datalake_service_client); + Result Enabled(const std::string& container_name); + + private: + Azure::Storage::Files::DataLake::DataLakeServiceClient* datalake_service_client_; + std::optional enabled_; +}; + +} // namespace arrow::fs::internal diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index f57fc4d8140a0..c08a4b50b77a8 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -34,6 +34,7 @@ #include #include "arrow/filesystem/azurefs.h" +#include "arrow/filesystem/azurefs_internal.h" #include #include @@ -46,7 +47,10 @@ #include #include #include +#include +#include "arrow/filesystem/test_util.h" +#include "arrow/result.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/util/io_util.h" @@ -139,34 +143,37 @@ TEST(AzureFileSystem, OptionsCompare) { EXPECT_TRUE(options.Equals(options)); } -class TestAzureFileSystem : public ::testing::Test { +class AzureFileSystemTest : public ::testing::Test { public: std::shared_ptr fs_; - std::shared_ptr service_client_; + std::unique_ptr blob_service_client_; + std::unique_ptr + datalake_service_client_; + AzureOptions options_; std::mt19937_64 generator_; std::string container_name_; + bool suite_skipped_ = false; - TestAzureFileSystem() : generator_(std::random_device()()) {} + AzureFileSystemTest() : generator_(std::random_device()()) {} - AzureOptions MakeOptions() { - const std::string& account_name = GetAzuriteEnv()->account_name(); - const std::string& account_key = GetAzuriteEnv()->account_key(); - AzureOptions options; - options.backend = AzureBackend::Azurite; - ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); - return options; - } + virtual Result MakeOptions() = 0; void SetUp() override { - ASSERT_THAT(GetAzuriteEnv(), NotNull()); - ASSERT_OK(GetAzuriteEnv()->status()); - - container_name_ = RandomChars(32); auto options = MakeOptions(); - service_client_ = std::make_shared( - options.account_blob_url, options.storage_credentials_provider); - ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options)); - auto container_client = service_client_->GetBlobContainerClient(container_name_); + if (options.ok()) { + options_ = *options; + } else { + suite_skipped_ = true; + GTEST_SKIP() << options.status().message(); + } + container_name_ = RandomChars(32); + blob_service_client_ = std::make_unique( + options_.account_blob_url, options_.storage_credentials_provider); + datalake_service_client_ = + std::make_unique( + options_.account_dfs_url, options_.storage_credentials_provider); + ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); + auto container_client = blob_service_client_->GetBlobContainerClient(container_name_); container_client.CreateIfNotExists(); auto blob_client = container_client.GetBlockBlobClient(PreexistingObjectName()); @@ -175,10 +182,13 @@ class TestAzureFileSystem : public ::testing::Test { } void TearDown() override { - auto containers = service_client_->ListBlobContainers(); - for (auto container : containers.BlobContainers) { - auto container_client = service_client_->GetBlobContainerClient(container.Name); - container_client.DeleteIfExists(); + if (!suite_skipped_) { + auto containers = blob_service_client_->ListBlobContainers(); + for (auto container : containers.BlobContainers) { + auto container_client = + blob_service_client_->GetBlobContainerClient(container.Name); + container_client.DeleteIfExists(); + } } } @@ -218,15 +228,175 @@ class TestAzureFileSystem : public ::testing::Test { void UploadLines(const std::vector& lines, const char* path_to_file, int total_size) { // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. - auto blob_client = service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(path_to_file); + auto blob_client = + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(path_to_file); std::string all_lines = std::accumulate(lines.begin(), lines.end(), std::string("")); blob_client.UploadFrom(reinterpret_cast(all_lines.data()), total_size); } + + void RunGetFileInfoObjectWithNestedStructureTest(); + void RunGetFileInfoObjectTest(); +}; + +class AzuriteFileSystemTest : public AzureFileSystemTest { + Result MakeOptions() { + EXPECT_THAT(GetAzuriteEnv(), NotNull()); + ARROW_EXPECT_OK(GetAzuriteEnv()->status()); + AzureOptions options; + options.backend = AzureBackend::Azurite; + ARROW_EXPECT_OK(options.ConfigureAccountKeyCredentials( + GetAzuriteEnv()->account_name(), GetAzuriteEnv()->account_key())); + return options; + } +}; + +class AzureFlatNamespaceFileSystemTest : public AzureFileSystemTest { + Result MakeOptions() override { + AzureOptions options; + const auto account_key = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_KEY"); + const auto account_name = std::getenv("AZURE_FLAT_NAMESPACE_ACCOUNT_NAME"); + if (account_key && account_name) { + RETURN_NOT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); + return options; + } + return Status::Cancelled( + "Connection details not provided for a real flat namespace " + "account."); + } }; -TEST_F(TestAzureFileSystem, OpenInputStreamString) { +class AzureHierarchicalNamespaceFileSystemTest : public AzureFileSystemTest { + Result MakeOptions() override { + AzureOptions options; + const auto account_key = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_KEY"); + const auto account_name = std::getenv("AZURE_HIERARCHICAL_NAMESPACE_ACCOUNT_NAME"); + if (account_key && account_name) { + RETURN_NOT_OK(options.ConfigureAccountKeyCredentials(account_name, account_key)); + return options; + } + return Status::Cancelled( + "Connection details not provided for a real hierarchical namespace " + "account."); + } +}; + +TEST_F(AzureFlatNamespaceFileSystemTest, DetectHierarchicalNamespace) { + auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); + ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); +} + +TEST_F(AzureHierarchicalNamespaceFileSystemTest, DetectHierarchicalNamespace) { + auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); + ASSERT_OK_AND_EQ(true, hierarchical_namespace.Enabled(PreexistingContainerName())); +} + +TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespace) { + auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); + ASSERT_OK_AND_EQ(false, hierarchical_namespace.Enabled(PreexistingContainerName())); +} + +TEST_F(AzuriteFileSystemTest, DetectHierarchicalNamespaceFailsWithMissingContainer) { + auto hierarchical_namespace = internal::HierarchicalNamespaceDetector(); + ASSERT_OK(hierarchical_namespace.Init(datalake_service_client_.get())); + ASSERT_NOT_OK(hierarchical_namespace.Enabled("non-existent-container")); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoAccount) { + AssertFileInfo(fs_.get(), "", FileType::Directory); + + // URI + ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://")); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoContainer) { + AssertFileInfo(fs_.get(), PreexistingContainerName(), FileType::Directory); + + AssertFileInfo(fs_.get(), "non-existent-container", FileType::NotFound); + + // URI + ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingContainerName())); +} + +void AzureFileSystemTest::RunGetFileInfoObjectWithNestedStructureTest() { + // Adds detailed tests to handle cases of different edge cases + // with directory naming conventions (e.g. with and without slashes). + constexpr auto kObjectName = "test-object-dir/some_other_dir/another_dir/foo"; + // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(kObjectName) + .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + + // 0 is immediately after "/" lexicographically, ensure that this doesn't + // cause unexpected issues. + // TODO(GH-38333): Switch to using Azure filesystem to write once its implemented. + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient("test-object-dir/some_other_dir0") + .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(std::string(kObjectName) + "0") + .UploadFrom(reinterpret_cast(kLoremIpsum), strlen(kLoremIpsum)); + + AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName, FileType::File); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + kObjectName + "/", + FileType::NotFound); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir", + FileType::Directory); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/", + FileType::Directory); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/some_other_dir", + FileType::Directory); + AssertFileInfo(fs_.get(), + PreexistingContainerPath() + "test-object-dir/some_other_dir/", + FileType::Directory); + + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-di", + FileType::NotFound); + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-object-dir/some_other_di", + FileType::NotFound); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoObjectWithNestedStructure) { + RunGetFileInfoObjectWithNestedStructureTest(); +} + +TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObjectWithNestedStructure) { + RunGetFileInfoObjectWithNestedStructureTest(); + datalake_service_client_->GetFileSystemClient(PreexistingContainerName()) + .GetDirectoryClient("test-empty-object-dir") + .Create(); + + AssertFileInfo(fs_.get(), PreexistingContainerPath() + "test-empty-object-dir", + FileType::Directory); +} + +void AzureFileSystemTest::RunGetFileInfoObjectTest() { + auto object_properties = + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlobClient(PreexistingObjectName()) + .GetProperties() + .Value; + + AssertFileInfo(fs_.get(), PreexistingObjectPath(), FileType::File, + std::chrono::system_clock::time_point(object_properties.LastModified), + static_cast(object_properties.BlobSize)); + + // URI + ASSERT_RAISES(Invalid, fs_->GetFileInfo("abfs://" + PreexistingObjectName())); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoObject) { RunGetFileInfoObjectTest(); } + +TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObject) { + RunGetFileInfoObjectTest(); +} + +TEST_F(AzuriteFileSystemTest, OpenInputStreamString) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); @@ -234,7 +404,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamString) { EXPECT_EQ(buffer->ToString(), kLoremIpsum); } -TEST_F(TestAzureFileSystem, OpenInputStreamStringBuffers) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamStringBuffers) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); @@ -248,10 +418,8 @@ TEST_F(TestAzureFileSystem, OpenInputStreamStringBuffers) { EXPECT_EQ(contents, kLoremIpsum); } -TEST_F(TestAzureFileSystem, OpenInputStreamInfo) { - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(PreexistingObjectPath())); - arrow::fs::FileInfo info(PreexistingObjectPath(), FileType::File); +TEST_F(AzuriteFileSystemTest, OpenInputStreamInfo) { + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingObjectPath())); std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(info)); @@ -260,10 +428,10 @@ TEST_F(TestAzureFileSystem, OpenInputStreamInfo) { EXPECT_EQ(buffer->ToString(), kLoremIpsum); } -TEST_F(TestAzureFileSystem, OpenInputStreamEmpty) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamEmpty) { const auto path_to_file = "empty-object.txt"; const auto path = PreexistingContainerPath() + path_to_file; - service_client_->GetBlobContainerClient(PreexistingContainerName()) + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) .GetBlockBlobClient(path_to_file) .UploadFrom(nullptr, 0); @@ -274,27 +442,23 @@ TEST_F(TestAzureFileSystem, OpenInputStreamEmpty) { EXPECT_EQ(size, 0); } -TEST_F(TestAzureFileSystem, OpenInputStreamNotFound) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamNotFound) { ASSERT_RAISES(IOError, fs_->OpenInputStream(NotFoundObjectPath())); } -TEST_F(TestAzureFileSystem, OpenInputStreamInfoInvalid) { - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(PreexistingBucketPath())); - arrow::fs::FileInfo info(PreexistingContainerPath(), FileType::Directory); +TEST_F(AzuriteFileSystemTest, OpenInputStreamInfoInvalid) { + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingContainerPath())); ASSERT_RAISES(IOError, fs_->OpenInputStream(info)); - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(NotFoundObjectPath())); - arrow::fs::FileInfo info2(PreexistingContainerPath(), FileType::NotFound); + ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(NotFoundObjectPath())); ASSERT_RAISES(IOError, fs_->OpenInputStream(info2)); } -TEST_F(TestAzureFileSystem, OpenInputStreamUri) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamUri) { ASSERT_RAISES(Invalid, fs_->OpenInputStream("abfss://" + PreexistingObjectPath())); } -TEST_F(TestAzureFileSystem, OpenInputStreamTrailingSlash) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamTrailingSlash) { ASSERT_RAISES(IOError, fs_->OpenInputStream(PreexistingObjectPath() + '/')); } @@ -321,7 +485,8 @@ std::shared_ptr NormalizerKeyValueMetadata( value = "2023-10-31T08:15:20Z"; } } else if (key == "ETag") { - if (internal::StartsWith(value, "\"") && internal::EndsWith(value, "\"")) { + if (arrow::internal::StartsWith(value, "\"") && + arrow::internal::EndsWith(value, "\"")) { // Valid value value = "\"ETagValue\""; } @@ -332,7 +497,7 @@ std::shared_ptr NormalizerKeyValueMetadata( } }; // namespace -TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamReadMetadata) { std::shared_ptr stream; ASSERT_OK_AND_ASSIGN(stream, fs_->OpenInputStream(PreexistingObjectPath())); @@ -362,7 +527,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamReadMetadata) { NormalizerKeyValueMetadata(actual)->ToString()); } -TEST_F(TestAzureFileSystem, OpenInputStreamClosed) { +TEST_F(AzuriteFileSystemTest, OpenInputStreamClosed) { ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputStream(PreexistingObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; @@ -371,7 +536,7 @@ TEST_F(TestAzureFileSystem, OpenInputStreamClosed) { ASSERT_RAISES(Invalid, stream->Tell()); } -TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { +TEST_F(AzuriteFileSystemTest, OpenInputFileMixedReadVsReadAt) { // Create a file large enough to make the random access tests non-trivial. auto constexpr kLineWidth = 100; auto constexpr kLineCount = 4096; @@ -417,7 +582,7 @@ TEST_F(TestAzureFileSystem, OpenInputFileMixedReadVsReadAt) { } } -TEST_F(TestAzureFileSystem, OpenInputFileRandomSeek) { +TEST_F(AzuriteFileSystemTest, OpenInputFileRandomSeek) { // Create a file large enough to make the random access tests non-trivial. auto constexpr kLineWidth = 100; auto constexpr kLineCount = 4096; @@ -445,14 +610,15 @@ TEST_F(TestAzureFileSystem, OpenInputFileRandomSeek) { } } -TEST_F(TestAzureFileSystem, OpenInputFileIoContext) { +TEST_F(AzuriteFileSystemTest, OpenInputFileIoContext) { // Create a test file. const auto path_to_file = "OpenInputFileIoContext/object-name"; const auto path = PreexistingContainerPath() + path_to_file; const std::string contents = "The quick brown fox jumps over the lazy dog"; - auto blob_client = service_client_->GetBlobContainerClient(PreexistingContainerName()) - .GetBlockBlobClient(path_to_file); + auto blob_client = + blob_service_client_->GetBlobContainerClient(PreexistingContainerName()) + .GetBlockBlobClient(path_to_file); blob_client.UploadFrom(reinterpret_cast(contents.data()), contents.length()); @@ -461,10 +627,8 @@ TEST_F(TestAzureFileSystem, OpenInputFileIoContext) { EXPECT_EQ(fs_->io_context().external_id(), file->io_context().external_id()); } -TEST_F(TestAzureFileSystem, OpenInputFileInfo) { - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(PreexistingObjectPath())); - arrow::fs::FileInfo info(PreexistingObjectPath(), FileType::File); +TEST_F(AzuriteFileSystemTest, OpenInputFileInfo) { + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingObjectPath())); std::shared_ptr file; ASSERT_OK_AND_ASSIGN(file, fs_->OpenInputFile(info)); @@ -478,23 +642,19 @@ TEST_F(TestAzureFileSystem, OpenInputFileInfo) { EXPECT_EQ(std::string(buffer.data(), size), expected); } -TEST_F(TestAzureFileSystem, OpenInputFileNotFound) { +TEST_F(AzuriteFileSystemTest, OpenInputFileNotFound) { ASSERT_RAISES(IOError, fs_->OpenInputFile(NotFoundObjectPath())); } -TEST_F(TestAzureFileSystem, OpenInputFileInfoInvalid) { - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(PreexistingContainerPath())); - arrow::fs::FileInfo info(PreexistingContainerPath(), FileType::File); +TEST_F(AzuriteFileSystemTest, OpenInputFileInfoInvalid) { + ASSERT_OK_AND_ASSIGN(auto info, fs_->GetFileInfo(PreexistingContainerPath())); ASSERT_RAISES(IOError, fs_->OpenInputFile(info)); - // TODO(GH-38335): When implemented use ASSERT_OK_AND_ASSIGN(info, - // fs->GetFileInfo(NotFoundObjectPath())); - arrow::fs::FileInfo info2(NotFoundObjectPath(), FileType::NotFound); + ASSERT_OK_AND_ASSIGN(auto info2, fs_->GetFileInfo(NotFoundObjectPath())); ASSERT_RAISES(IOError, fs_->OpenInputFile(info2)); } -TEST_F(TestAzureFileSystem, OpenInputFileClosed) { +TEST_F(AzuriteFileSystemTest, OpenInputFileClosed) { ASSERT_OK_AND_ASSIGN(auto stream, fs_->OpenInputFile(PreexistingObjectPath())); ASSERT_OK(stream->Close()); std::array buffer{}; diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index 90af3c66ff8d4..46ea436a9f31a 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -191,18 +191,15 @@ std::string_view RemoveLeadingSlash(std::string_view key) { } Status AssertNoTrailingSlash(std::string_view key) { - if (key.back() == '/') { + if (HasTrailingSlash(key)) { return NotAFile(key); } return Status::OK(); } -bool HasLeadingSlash(std::string_view key) { - if (key.front() != '/') { - return false; - } - return true; -} +bool HasTrailingSlash(std::string_view key) { return key.back() == '/'; } + +bool HasLeadingSlash(std::string_view key) { return key.front() == '/'; } Result MakeAbstractPathRelative(const std::string& base, const std::string& path) { diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index 13a74b7fa12c8..2c8c123e779f4 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -94,6 +94,9 @@ std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root = fa ARROW_EXPORT Status AssertNoTrailingSlash(std::string_view s); +ARROW_EXPORT +bool HasTrailingSlash(std::string_view s); + ARROW_EXPORT bool HasLeadingSlash(std::string_view s);