diff --git a/cpp/src/arrow/filesystem/azurefs.cc b/cpp/src/arrow/filesystem/azurefs.cc index 9bd2b0ae9d8a0..daababb04c172 100644 --- a/cpp/src/arrow/filesystem/azurefs.cc +++ b/cpp/src/arrow/filesystem/azurefs.cc @@ -39,7 +39,7 @@ namespace fs { // ----------------------------------------------------------------------- // AzureOptions Implementation -AzureOptions::AzureOptions() {} +AzureOptions::AzureOptions() = default; bool AzureOptions::Equals(const AzureOptions& other) const { return (account_dfs_url == other.account_dfs_url && @@ -820,6 +820,209 @@ class AzureFileSystem::Impl { } } + private: + template + Status VisitContainers(const Azure::Core::Context& context, + OnContainer&& on_container) const { + Azure::Storage::Blobs::ListBlobContainersOptions options; + try { + auto container_list_response = + blob_service_client_->ListBlobContainers(options, context); + for (; container_list_response.HasPage(); + container_list_response.MoveToNextPage(context)) { + for (const auto& container : container_list_response.BlobContainers) { + RETURN_NOT_OK(on_container(container)); + } + } + } catch (const Azure::Storage::StorageException& exception) { + return internal::ExceptionToStatus("Failed to list account containers.", exception); + } + return Status::OK(); + } + + static FileInfo FileInfoFromBlob(const std::string& container, + const Azure::Storage::Blobs::Models::BlobItem& blob) { + auto path = internal::ConcatAbstractPath(container, blob.Name); + if (internal::HasTrailingSlash(blob.Name)) { + return DirectoryFileInfoFromPath(path); + } + FileInfo info{std::move(path), FileType::File}; + info.set_size(blob.BlobSize); + info.set_mtime(std::chrono::system_clock::time_point{blob.Details.LastModified}); + return info; + } + + static FileInfo DirectoryFileInfoFromPath(const std::string& path) { + return FileInfo{std::string{internal::RemoveTrailingSlash(path)}, + FileType::Directory}; + } + + static std::string_view BasenameView(std::string_view s) { + DCHECK(!internal::HasTrailingSlash(s)); + auto offset = s.find_last_of(internal::kSep); + auto result = (offset == std::string_view::npos) ? s : s.substr(offset); + DCHECK(!result.empty() && result.back() != internal::kSep); + return result; + } + + /// \brief List the blobs at the root of a container or some dir in a container. + /// + /// \pre container_client is the client for the container named like the first + /// segment of select.base_dir. + Status GetFileInfoWithSelectorFromContainer( + const Azure::Storage::Blobs::BlobContainerClient& container_client, + const Azure::Core::Context& context, Azure::Nullable page_size_hint, + const FileSelector& select, FileInfoVector* acc_results) { + ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir)); + + bool found = false; + Azure::Storage::Blobs::ListBlobsOptions options; + if (internal::IsEmptyPath(base_location.path)) { + // If the base_dir is the root of the container, then we want to list all blobs in + // the container and the Prefix should be empty and not even include the trailing + // slash because the container itself represents the `/` directory. + options.Prefix = {}; + found = true; // Unless the container itself is not found later! + } else { + options.Prefix = internal::EnsureTrailingSlash(base_location.path); + } + options.PageSizeHint = page_size_hint; + options.Include = Azure::Storage::Blobs::Models::ListBlobsIncludeFlags::Metadata; + + auto recurse = [&](const std::string& blob_prefix) noexcept -> Status { + if (select.recursive && select.max_recursion > 0) { + FileSelector sub_select; + sub_select.base_dir = internal::ConcatAbstractPath( + base_location.container, internal::RemoveTrailingSlash(blob_prefix)); + sub_select.allow_not_found = true; + sub_select.recursive = true; + sub_select.max_recursion = select.max_recursion - 1; + return GetFileInfoWithSelectorFromContainer( + container_client, context, page_size_hint, sub_select, acc_results); + } + return Status::OK(); + }; + + auto process_blob = + [&](const Azure::Storage::Blobs::Models::BlobItem& blob) noexcept { + // blob.Name has trailing slash only when Prefix is an empty + // directory marker blob for the directory we're listing + // from, and we should skip it. + if (!internal::HasTrailingSlash(blob.Name)) { + acc_results->push_back(FileInfoFromBlob(base_location.container, blob)); + } + }; + auto process_prefix = [&](const std::string& prefix) noexcept -> Status { + const auto path = internal::ConcatAbstractPath(base_location.container, prefix); + acc_results->push_back(DirectoryFileInfoFromPath(path)); + return recurse(prefix); + }; + + try { + auto list_response = + container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options, context); + for (; list_response.HasPage(); list_response.MoveToNextPage(context)) { + if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty()) { + continue; + } + found = true; + // Blob and BlobPrefixes are sorted by name, so we can merge-iterate + // them to ensure returned results are all sorted. + size_t blob_index = 0; + size_t blob_prefix_index = 0; + while (blob_index < list_response.Blobs.size() && + blob_prefix_index < list_response.BlobPrefixes.size()) { + const auto& blob = list_response.Blobs[blob_index]; + const auto& prefix = list_response.BlobPrefixes[blob_prefix_index]; + const int cmp = blob.Name.compare(prefix); + if (cmp < 0) { + process_blob(blob); + blob_index += 1; + } else if (cmp > 0) { + RETURN_NOT_OK(process_prefix(prefix)); + blob_prefix_index += 1; + } else { + DCHECK_EQ(blob.Name, prefix); + RETURN_NOT_OK(process_prefix(prefix)); + blob_index += 1; + blob_prefix_index += 1; + // If the container has an empty dir marker blob and another blob starting + // with this blob name as a prefix, the blob doesn't appear in the listing + // that also contains the prefix, so AFAICT this branch in unreachable. The + // code above is kept just in case, but if this DCHECK(false) is ever reached, + // we should refactor this loop to ensure no duplicate entries are ever + // reported. + DCHECK(false) + << "Unexpected blob/prefix name collision on the same listing request"; + } + } + for (; blob_index < list_response.Blobs.size(); blob_index++) { + process_blob(list_response.Blobs[blob_index]); + } + for (; blob_prefix_index < list_response.BlobPrefixes.size(); + blob_prefix_index++) { + RETURN_NOT_OK(process_prefix(list_response.BlobPrefixes[blob_prefix_index])); + } + } + } catch (const Azure::Storage::StorageException& exception) { + if (exception.ErrorCode == "ContainerNotFound") { + found = false; + } else { + return internal::ExceptionToStatus( + "Failed to list blobs in a directory: " + select.base_dir + ": " + + container_client.GetUrl(), + exception); + } + } + + return found || select.allow_not_found + ? Status::OK() + : ::arrow::fs::internal::PathNotFound(select.base_dir); + } + + public: + Status GetFileInfoWithSelector(const Azure::Core::Context& context, + Azure::Nullable page_size_hint, + const FileSelector& select, + FileInfoVector* acc_results) { + ARROW_ASSIGN_OR_RAISE(auto base_location, AzureLocation::FromString(select.base_dir)); + + if (base_location.container.empty()) { + // Without a container, the base_location is equivalent to the filesystem + // root -- `/`. FileSelector::allow_not_found doesn't matter in this case + // because the root always exists. + auto on_container = + [&](const Azure::Storage::Blobs::Models::BlobContainerItem& container) { + // Deleted containers are not listed by ListContainers. + DCHECK(!container.IsDeleted); + + // Every container is considered a directory. + FileInfo info{container.Name, FileType::Directory}; + info.set_mtime( + std::chrono::system_clock::time_point{container.Details.LastModified}); + acc_results->push_back(std::move(info)); + + // Recurse into containers (subdirectories) if requested. + if (select.recursive && select.max_recursion > 0) { + FileSelector sub_select; + sub_select.base_dir = container.Name; + sub_select.allow_not_found = true; + sub_select.recursive = true; + sub_select.max_recursion = select.max_recursion - 1; + ARROW_RETURN_NOT_OK(GetFileInfoWithSelector(context, page_size_hint, + sub_select, acc_results)); + } + return Status::OK(); + }; + return VisitContainers(context, std::move(on_container)); + } + + auto container_client = + blob_service_client_->GetBlobContainerClient(base_location.container); + return GetFileInfoWithSelectorFromContainer(container_client, context, page_size_hint, + select, acc_results); + } + Result> OpenInputFile(const AzureLocation& location, AzureFileSystem* fs) { RETURN_NOT_OK(ValidateFileLocation(location)); @@ -1196,7 +1399,12 @@ Result AzureFileSystem::GetFileInfo(const std::string& path) { } Result AzureFileSystem::GetFileInfo(const FileSelector& select) { - return Status::NotImplemented("The Azure FileSystem is not fully implemented"); + Azure::Core::Context context; + Azure::Nullable page_size_hint; // unspecified + FileInfoVector results; + RETURN_NOT_OK( + impl_->GetFileInfoWithSelector(context, page_size_hint, select, &results)); + return {std::move(results)}; } Status AzureFileSystem::CreateDir(const std::string& path, bool recursive) { diff --git a/cpp/src/arrow/filesystem/azurefs.h b/cpp/src/arrow/filesystem/azurefs.h index 9f980ee8baae0..b2865b059ef6e 100644 --- a/cpp/src/arrow/filesystem/azurefs.h +++ b/cpp/src/arrow/filesystem/azurefs.h @@ -157,7 +157,7 @@ class ARROW_EXPORT AzureFileSystem : public FileSystem { const AzureOptions& options, const io::IOContext& = io::default_io_context()); private: - explicit AzureFileSystem(const AzureOptions& options, const io::IOContext& io_context); + AzureFileSystem(const AzureOptions& options, const io::IOContext& io_context); class Impl; std::unique_ptr impl_; diff --git a/cpp/src/arrow/filesystem/azurefs_test.cc b/cpp/src/arrow/filesystem/azurefs_test.cc index 41f1663114f45..792c63b209402 100644 --- a/cpp/src/arrow/filesystem/azurefs_test.cc +++ b/cpp/src/arrow/filesystem/azurefs_test.cc @@ -70,6 +70,9 @@ using ::testing::IsEmpty; using ::testing::Not; using ::testing::NotNull; +namespace Blobs = Azure::Storage::Blobs; +namespace Files = Azure::Storage::Files; + auto const* kLoremIpsum = R"""( Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis @@ -193,9 +196,8 @@ TEST(AzureFileSystem, OptionsCompare) { class AzureFileSystemTest : public ::testing::Test { public: std::shared_ptr fs_; - std::unique_ptr blob_service_client_; - std::unique_ptr - datalake_service_client_; + std::unique_ptr blob_service_client_; + std::unique_ptr datalake_service_client_; AzureOptions options_; std::mt19937_64 generator_; std::string container_name_; @@ -213,15 +215,14 @@ class AzureFileSystemTest : public ::testing::Test { suite_skipped_ = true; GTEST_SKIP() << options.status().message(); } - container_name_ = RandomChars(32); - blob_service_client_ = std::make_unique( + // Stop-gap solution before GH-39119 is fixed. + container_name_ = "z" + RandomChars(31); + blob_service_client_ = std::make_unique( options_.account_blob_url, options_.storage_credentials_provider); - datalake_service_client_ = - std::make_unique( - options_.account_dfs_url, options_.storage_credentials_provider); + datalake_service_client_ = std::make_unique( + options_.account_dfs_url, options_.storage_credentials_provider); ASSERT_OK_AND_ASSIGN(fs_, AzureFileSystem::Make(options_)); - auto container_client = blob_service_client_->GetBlobContainerClient(container_name_); - container_client.CreateIfNotExists(); + auto container_client = CreateContainer(container_name_); auto blob_client = container_client.GetBlockBlobClient(PreexistingObjectName()); blob_client.UploadFrom(reinterpret_cast(kLoremIpsum), @@ -239,6 +240,20 @@ class AzureFileSystemTest : public ::testing::Test { } } + Blobs::BlobContainerClient CreateContainer(const std::string& name) { + auto container_client = blob_service_client_->GetBlobContainerClient(name); + (void)container_client.CreateIfNotExists(); + return container_client; + } + + Blobs::BlobClient CreateBlob(Blobs::BlobContainerClient& container_client, + const std::string& name, const std::string& data = "") { + auto blob_client = container_client.GetBlockBlobClient(name); + (void)blob_client.UploadFrom(reinterpret_cast(data.data()), + data.size()); + return blob_client; + } + std::string PreexistingContainerName() const { return container_name_; } std::string PreexistingContainerPath() const { @@ -326,6 +341,45 @@ class AzureFileSystemTest : public ::testing::Test { top_blob_path, }; } + + char const* kSubData = "sub data"; + char const* kSomeData = "some data"; + char const* kOtherData = "other data"; + + void SetUpSmallFileSystemTree() { + // Set up test containers + CreateContainer("empty-container"); + auto container = CreateContainer("container"); + + CreateBlob(container, "emptydir/"); + CreateBlob(container, "somedir/subdir/subfile", kSubData); + CreateBlob(container, "somefile", kSomeData); + // Add an explicit marker for a non-empty directory. + CreateBlob(container, "otherdir/1/2/"); + // otherdir/{1/,2/,3/} are implicitly assumed to exist because of + // the otherdir/1/2/3/otherfile blob. + CreateBlob(container, "otherdir/1/2/3/otherfile", kOtherData); + } + + void AssertInfoAllContainersRecursive(const std::vector& infos) { + ASSERT_EQ(infos.size(), 14); + AssertFileInfo(infos[0], "container", FileType::Directory); + AssertFileInfo(infos[1], "container/emptydir", FileType::Directory); + AssertFileInfo(infos[2], "container/otherdir", FileType::Directory); + AssertFileInfo(infos[3], "container/otherdir/1", FileType::Directory); + AssertFileInfo(infos[4], "container/otherdir/1/2", FileType::Directory); + AssertFileInfo(infos[5], "container/otherdir/1/2/3", FileType::Directory); + AssertFileInfo(infos[6], "container/otherdir/1/2/3/otherfile", FileType::File, + strlen(kOtherData)); + AssertFileInfo(infos[7], "container/somedir", FileType::Directory); + AssertFileInfo(infos[8], "container/somedir/subdir", FileType::Directory); + AssertFileInfo(infos[9], "container/somedir/subdir/subfile", FileType::File, + strlen(kSubData)); + AssertFileInfo(infos[10], "container/somefile", FileType::File, strlen(kSomeData)); + AssertFileInfo(infos[11], "empty-container", FileType::Directory); + AssertFileInfo(infos[12], PreexistingContainerName(), FileType::Directory); + AssertFileInfo(infos[13], PreexistingObjectPath(), FileType::File); + } }; class AzuriteFileSystemTest : public AzureFileSystemTest { @@ -518,6 +572,180 @@ TEST_F(AzureHierarchicalNamespaceFileSystemTest, GetFileInfoObject) { RunGetFileInfoObjectTest(); } +TEST_F(AzuriteFileSystemTest, GetFileInfoSelector) { + SetUpSmallFileSystemTree(); + + FileSelector select; + std::vector infos; + + // Root dir + select.base_dir = ""; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 3); + ASSERT_EQ(infos, SortedInfos(infos)); + AssertFileInfo(infos[0], "container", FileType::Directory); + AssertFileInfo(infos[1], "empty-container", FileType::Directory); + AssertFileInfo(infos[2], container_name_, FileType::Directory); + + // Empty container + select.base_dir = "empty-container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + // Nonexistent container + select.base_dir = "nonexistent-container"; + ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + select.allow_not_found = true; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + select.allow_not_found = false; + // Non-empty container + select.base_dir = "container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 4); + AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); + AssertFileInfo(infos[1], "container/otherdir", FileType::Directory); + AssertFileInfo(infos[2], "container/somedir", FileType::Directory); + AssertFileInfo(infos[3], "container/somefile", FileType::File, 9); + + // Empty "directory" + select.base_dir = "container/emptydir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + // Non-empty "directories" + select.base_dir = "container/somedir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); + select.base_dir = "container/somedir/subdir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/somedir/subdir/subfile", FileType::File, 8); + // Nonexistent + select.base_dir = "container/nonexistent"; + ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + select.allow_not_found = true; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + select.allow_not_found = false; + + // Trailing slashes + select.base_dir = "empty-container/"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + select.base_dir = "nonexistent-container/"; + ASSERT_RAISES(IOError, fs_->GetFileInfo(select)); + select.base_dir = "container/"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 4); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorRecursive) { + SetUpSmallFileSystemTree(); + + FileSelector select; + select.recursive = true; + + std::vector infos; + // Root dir + select.base_dir = ""; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 14); + ASSERT_EQ(infos, SortedInfos(infos)); + AssertInfoAllContainersRecursive(infos); + + // Empty container + select.base_dir = "empty-container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + // Non-empty container + select.base_dir = "container"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 10); + AssertFileInfo(infos[0], "container/emptydir", FileType::Directory); + AssertFileInfo(infos[1], "container/otherdir", FileType::Directory); + AssertFileInfo(infos[2], "container/otherdir/1", FileType::Directory); + AssertFileInfo(infos[3], "container/otherdir/1/2", FileType::Directory); + AssertFileInfo(infos[4], "container/otherdir/1/2/3", FileType::Directory); + AssertFileInfo(infos[5], "container/otherdir/1/2/3/otherfile", FileType::File, 10); + AssertFileInfo(infos[6], "container/somedir", FileType::Directory); + AssertFileInfo(infos[7], "container/somedir/subdir", FileType::Directory); + AssertFileInfo(infos[8], "container/somedir/subdir/subfile", FileType::File, 8); + AssertFileInfo(infos[9], "container/somefile", FileType::File, 9); + + // Empty "directory" + select.base_dir = "container/emptydir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + // Non-empty "directories" + select.base_dir = "container/somedir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 2); + AssertFileInfo(infos[0], "container/somedir/subdir", FileType::Directory); + AssertFileInfo(infos[1], "container/somedir/subdir/subfile", FileType::File, 8); + + select.base_dir = "container/otherdir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos, SortedInfos(infos)); + ASSERT_EQ(infos.size(), 4); + AssertFileInfo(infos[0], "container/otherdir/1", FileType::Directory); + AssertFileInfo(infos[1], "container/otherdir/1/2", FileType::Directory); + AssertFileInfo(infos[2], "container/otherdir/1/2/3", FileType::Directory); + AssertFileInfo(infos[3], "container/otherdir/1/2/3/otherfile", FileType::File, 10); +} + +TEST_F(AzuriteFileSystemTest, GetFileInfoSelectorExplicitImplicitDirDedup) { + { + auto container = CreateContainer("container"); + CreateBlob(container, "mydir/emptydir1/"); + CreateBlob(container, "mydir/emptydir2/"); + CreateBlob(container, "mydir/nonemptydir1/"); // explicit dir marker + CreateBlob(container, "mydir/nonemptydir1/somefile", kSomeData); + CreateBlob(container, "mydir/nonemptydir2/somefile", kSomeData); + } + std::vector infos; + + FileSelector select; // non-recursive + select.base_dir = "container"; + + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + ASSERT_EQ(infos, SortedInfos(infos)); + AssertFileInfo(infos[0], "container/mydir", FileType::Directory); + + select.base_dir = "container/mydir"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 4); + ASSERT_EQ(infos, SortedInfos(infos)); + AssertFileInfo(infos[0], "container/mydir/emptydir1", FileType::Directory); + AssertFileInfo(infos[1], "container/mydir/emptydir2", FileType::Directory); + AssertFileInfo(infos[2], "container/mydir/nonemptydir1", FileType::Directory); + AssertFileInfo(infos[3], "container/mydir/nonemptydir2", FileType::Directory); + + select.base_dir = "container/mydir/emptydir1"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + select.base_dir = "container/mydir/emptydir2"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 0); + + select.base_dir = "container/mydir/nonemptydir1"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/mydir/nonemptydir1/somefile", FileType::File); + + select.base_dir = "container/mydir/nonemptydir2"; + ASSERT_OK_AND_ASSIGN(infos, fs_->GetFileInfo(select)); + ASSERT_EQ(infos.size(), 1); + AssertFileInfo(infos[0], "container/mydir/nonemptydir2/somefile", FileType::File); +} + TEST_F(AzuriteFileSystemTest, CreateDirFailureNoContainer) { ASSERT_RAISES(Invalid, fs_->CreateDir("", false)); } diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index 9ecc4610f3864..810e9c179b156 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -654,8 +654,7 @@ Status CopyFiles(const std::shared_ptr& source_fs, "', which is outside base dir '", source_sel.base_dir, "'"); } - auto destination_path = - internal::ConcatAbstractPath(destination_base_dir, std::string(*relative)); + auto destination_path = internal::ConcatAbstractPath(destination_base_dir, *relative); if (source_info.IsDirectory()) { dirs.push_back(destination_path); diff --git a/cpp/src/arrow/filesystem/path_util.cc b/cpp/src/arrow/filesystem/path_util.cc index 46ea436a9f31a..9c895ae76c7b8 100644 --- a/cpp/src/arrow/filesystem/path_util.cc +++ b/cpp/src/arrow/filesystem/path_util.cc @@ -52,7 +52,7 @@ std::vector SplitAbstractPath(const std::string& path, char sep) { } auto append_part = [&parts, &v](size_t start, size_t end) { - parts.push_back(std::string(v.substr(start, end - start))); + parts.emplace_back(v.substr(start, end - start)); }; size_t start = 0; @@ -72,15 +72,12 @@ std::string SliceAbstractPath(const std::string& s, int offset, int length, char return ""; } std::vector components = SplitAbstractPath(s, sep); - std::stringstream combined; if (offset >= static_cast(components.size())) { return ""; } - int end = offset + length; - if (end > static_cast(components.size())) { - end = static_cast(components.size()); - } - for (int i = offset; i < end; i++) { + const auto end = std::min(static_cast(offset) + length, components.size()); + std::stringstream combined; + for (auto i = static_cast(offset); i < end; i++) { combined << components[i]; if (i < end - 1) { combined << sep; @@ -140,16 +137,20 @@ Status ValidateAbstractPathParts(const std::vector& parts) { return Status::OK(); } -std::string ConcatAbstractPath(const std::string& base, const std::string& stem) { +std::string ConcatAbstractPath(std::string_view base, std::string_view stem) { DCHECK(!stem.empty()); if (base.empty()) { - return stem; + return std::string{stem}; } - return EnsureTrailingSlash(base) + std::string(RemoveLeadingSlash(stem)); + std::string result; + result.reserve(base.length() + stem.length() + 1); // extra 1 is for potential kSep + result += EnsureTrailingSlash(base); + result += RemoveLeadingSlash(stem); + return result; } std::string EnsureTrailingSlash(std::string_view v) { - if (v.length() > 0 && v.back() != kSep) { + if (!v.empty() && !HasTrailingSlash(v)) { // XXX How about "C:" on Windows? We probably don't want to turn it into "C:/"... // Unless the local filesystem always uses absolute paths return std::string(v) + kSep; @@ -159,7 +160,7 @@ std::string EnsureTrailingSlash(std::string_view v) { } std::string EnsureLeadingSlash(std::string_view v) { - if (v.length() == 0 || v.front() != kSep) { + if (!HasLeadingSlash(v)) { // XXX How about "C:" on Windows? We probably don't want to turn it into "/C:"... return kSep + std::string(v); } else { @@ -197,10 +198,6 @@ Status AssertNoTrailingSlash(std::string_view key) { return Status::OK(); } -bool HasTrailingSlash(std::string_view key) { return key.back() == '/'; } - -bool HasLeadingSlash(std::string_view key) { return key.front() == '/'; } - Result MakeAbstractPathRelative(const std::string& base, const std::string& path) { if (base.empty() || base.front() != kSep) { @@ -383,7 +380,7 @@ struct Globber::Impl { Globber::Globber(std::string pattern) : impl_(new Impl(pattern)) {} -Globber::~Globber() {} +Globber::~Globber() = default; bool Globber::Matches(const std::string& path) { return regex_match(path, impl_->pattern_); diff --git a/cpp/src/arrow/filesystem/path_util.h b/cpp/src/arrow/filesystem/path_util.h index 2c8c123e779f4..1da7afd3f9381 100644 --- a/cpp/src/arrow/filesystem/path_util.h +++ b/cpp/src/arrow/filesystem/path_util.h @@ -69,7 +69,7 @@ Status ValidateAbstractPathParts(const std::vector& parts); // Append a non-empty stem to an abstract path. ARROW_EXPORT -std::string ConcatAbstractPath(const std::string& base, const std::string& stem); +std::string ConcatAbstractPath(std::string_view base, std::string_view stem); // Make path relative to base, if it starts with base. Otherwise error out. ARROW_EXPORT @@ -94,11 +94,13 @@ std::string_view RemoveTrailingSlash(std::string_view s, bool preserve_root = fa ARROW_EXPORT Status AssertNoTrailingSlash(std::string_view s); -ARROW_EXPORT -bool HasTrailingSlash(std::string_view s); +inline bool HasTrailingSlash(std::string_view s) { + return !s.empty() && s.back() == kSep; +} -ARROW_EXPORT -bool HasLeadingSlash(std::string_view s); +inline bool HasLeadingSlash(std::string_view s) { + return !s.empty() && s.front() == kSep; +} ARROW_EXPORT bool IsAncestorOf(std::string_view ancestor, std::string_view descendant); diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index 6c5dda8e659df..040917dcd218a 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -126,6 +126,12 @@ void SortInfos(std::vector* infos) { std::sort(infos->begin(), infos->end(), FileInfo::ByPath{}); } +std::vector SortedInfos(const std::vector& infos) { + auto sorted = infos; + SortInfos(&sorted); + return sorted; +} + void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos) { auto fut = CollectAsyncGenerator(gen); ASSERT_FINISHES_OK_AND_ASSIGN(auto nested_infos, fut); diff --git a/cpp/src/arrow/filesystem/test_util.h b/cpp/src/arrow/filesystem/test_util.h index c4d846fd31b34..62b488e159a24 100644 --- a/cpp/src/arrow/filesystem/test_util.h +++ b/cpp/src/arrow/filesystem/test_util.h @@ -74,6 +74,10 @@ void CreateFile(FileSystem* fs, const std::string& path, const std::string& data ARROW_TESTING_EXPORT void SortInfos(FileInfoVector* infos); +// Create a copy of a FileInfo vector sorted by lexicographic path order +ARROW_TESTING_EXPORT +FileInfoVector SortedInfos(const FileInfoVector& infos); + ARROW_TESTING_EXPORT void CollectFileInfoGenerator(FileInfoGenerator gen, FileInfoVector* out_infos);