diff --git a/cpp/src/arrow/filesystem/path_tree.cc b/cpp/src/arrow/filesystem/path_tree.cc index 42e39f592f644..b234e35c99796 100644 --- a/cpp/src/arrow/filesystem/path_tree.cc +++ b/cpp/src/arrow/filesystem/path_tree.cc @@ -19,110 +19,73 @@ #include "arrow/filesystem/path_tree.h" #include +#include +#include #include #include #include #include +#include #include "arrow/filesystem/path_util.h" namespace arrow { namespace fs { -static int PathDepth(const std::string& path) { - return std::count(path.begin(), path.end(), internal::kSep); -} - using PathTreeByPathMap = std::unordered_map>; -template -bool IsType(const std ::shared_ptr& tree) { - return tree->stats().type() == Type; -} - -std::shared_ptr FindAncestor(PathTreeByPathMap* directories, std::string path) { +std::shared_ptr FindAncestor(const PathTreeByPathMap& directories, + std::string path) { while (path != "") { auto parent = internal::GetAbstractPathParent(path).first; - auto found = directories->find(parent); - if (found != directories->end()) { + auto found = directories.find(parent); + if (found != directories.end()) { return found->second; } - path = parent; + path = std::move(parent); } return nullptr; } -static void LinkToParentOrInsertNewRoot(FileStats stats, PathTreeByPathMap* directories, - PathTreeForest* forest) { - auto node = std::make_shared(stats); - if (stats.path() == "") { - forest->push_back(node); - return; - } - - auto ancestor = FindAncestor(directories, stats.path()); - if (ancestor) { - ancestor->AddChild(node); - } else { - forest->push_back(node); - } - - if (IsType(node)) { - directories->insert({stats.path(), node}); - } -} - -using DirectoryByDepthMap = std::unordered_map>; +Status PathTree::Make(std::vector stats, PathForest* out) { + PathTreeByPathMap directories; + PathForest forest; -std::vector OrderedDepths(const DirectoryByDepthMap& directories_by_depth) { - std::vector depths; - for (auto k_v : directories_by_depth) { - depths.push_back(k_v.first); - } + auto link_parent_or_insert_root = [&directories, &forest](const FileStats& s) { + if (s.path() == "") { + return; + } - // In practice, this is going to be O(lg(n)lg(lg(n))), i.e. constant. - std::sort(depths.begin(), depths.end()); - return depths; -} + auto ancestor = FindAncestor(directories, s.path()); + auto node = std::make_shared(s); + if (ancestor) { + ancestor->AddChild(node); + } else { + forest.push_back(node); + } -Status PathTree::Make(std::vector stats, PathTreeForest* out) { - PathTreeByPathMap directories; - PathTreeForest forest; - - // Partition the stats vector into (directories, others) - auto is_directory = [](const FileStats& stats) { return stats.IsDirectory(); }; - std::stable_partition(stats.begin(), stats.end(), is_directory); - auto mid = std::partition_point(stats.begin(), stats.end(), is_directory); - - // First, partition directories by path depth. - DirectoryByDepthMap directories_by_depth; - std::for_each(stats.begin(), mid, [&directories_by_depth](FileStats s) { - directories_by_depth[PathDepth(s.path())].push_back(s); - }); - - // Insert directories by ascending depth, ensuring that children directories - // are always inserted after ancestors. - for (int d : OrderedDepths(directories_by_depth)) { - auto dir = directories_by_depth.at(d); - std::for_each(dir.begin(), dir.end(), [&directories, &forest](FileStats s) { - LinkToParentOrInsertNewRoot(s, &directories, &forest); - }); - } + if (s.type() == FileType::Directory) { + directories[s.path()] = node; + } + }; - // Second, ingest files. By the same logic, directories are added before - // files, hence the lookup for ancestors is valid. - std::for_each(mid, stats.end(), [&directories, &forest](FileStats s) { - LinkToParentOrInsertNewRoot(s, &directories, &forest); - }); + // Insert nodes by ascending path length, ensuring that nodes are always + // inserted after their ancestors. Note that this strategy does not account + // for special directories like '..'. It is expected that path are absolute. + auto cmp = [](const FileStats& lhs, const FileStats& rhs) { + return lhs.path().size() < rhs.path().size(); + }; + std::stable_sort(stats.begin(), stats.end(), cmp); + std::for_each(stats.cbegin(), stats.cend(), link_parent_or_insert_root); *out = std::move(forest); return Status::OK(); } Status PathTree::Make(std::vector stats, std::shared_ptr* out) { - PathTreeForest forest; + PathForest forest; RETURN_NOT_OK(Make(stats, &forest)); auto size = forest.size(); @@ -135,5 +98,40 @@ Status PathTree::Make(std::vector stats, std::shared_ptr* o return Status::OK(); } +std::ostream& operator<<(std::ostream& os, const PathTree& tree) { + os << "PathTree(" << tree.stats(); + + const auto& subtrees = tree.subtrees(); + if (subtrees.size()) { + os << ", ["; + for (size_t i = 0; i < subtrees.size(); i++) { + if (i != 0) os << ", "; + os << *subtrees[i]; + } + os << "]"; + } + os << ")"; + return os; +} + +std::ostream& operator<<(std::ostream& os, const std::shared_ptr& tree) { + if (tree != nullptr) { + return os << *tree.get(); + } + + return os; +} + +bool operator==(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) { + if (lhs == NULLPTR && rhs == NULLPTR) { + return true; + } else if (lhs != NULLPTR && rhs != NULLPTR) { + return *lhs == *rhs; + } + + return false; +} + } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/path_tree.h b/cpp/src/arrow/filesystem/path_tree.h index 145917d11a4ec..6add0fd7e6402 100644 --- a/cpp/src/arrow/filesystem/path_tree.h +++ b/cpp/src/arrow/filesystem/path_tree.h @@ -20,6 +20,7 @@ #include "arrow/filesystem/filesystem.h" #include +#include #include #include #include @@ -29,8 +30,8 @@ namespace fs { class ARROW_EXPORT PathTree; -/// \brief A PathTreeForest consists of multiples PathTree -using PathTreeForest = std::vector>; +/// \brief A PathForest consists of multiples PathTree +using PathForest = std::vector>; /// \brief A PathTree is a utility to transform a vector of FileStats into a /// forest representation for tree traversal purposes. Node in the graph wraps @@ -44,7 +45,7 @@ class ARROW_EXPORT PathTree { /// \brief Transforms a FileStats vector into a forest of trees. Since there /// is no guarantee of complete trees, it is possible to have a forest /// (multiple roots). The caller should ensure that stats have unique path. - static Status Make(std::vector stats, PathTreeForest* out); + static Status Make(std::vector stats, PathForest* out); /// \brief Like MakeForest but fails if there's more than one root. static Status Make(std::vector stats, std::shared_ptr* out); @@ -54,14 +55,6 @@ class ARROW_EXPORT PathTree { /// \brief Returns the subtrees under this node. std::vector> subtrees() const { return subtrees_; } - template - void Visit(Visitor v) const { - v(stats_); - - auto recurse = [&v](const std::shared_ptr& tree) { tree->Visit(v); }; - std::for_each(subtrees_.cbegin(), subtrees_.cend(), recurse); - } - /// \brief Visit with eager pruning. template void Visit(Visitor v, Matcher m) const { @@ -71,18 +64,37 @@ class ARROW_EXPORT PathTree { v(stats_); - auto recurse = [&v, &m](const std::shared_ptr& tree) { tree->Visit(v, m); }; + auto recurse = [&v, &m](const std::shared_ptr& t) { t->Visit(v, m); }; std::for_each(subtrees_.cbegin(), subtrees_.cend(), recurse); } - void AddChild(std::shared_ptr child) { - subtrees_.push_back(std::move(child)); + template + void Visit(Visitor v) const { + auto always_match = [](const std::shared_ptr& t) { return true; }; + return Visit(std::move(v), always_match); + } + + bool operator==(const PathTree& other) const { + return stats_ == other.stats_ && subtrees_ == other.subtrees_; } protected: FileStats stats_; std::vector> subtrees_; + + // The AddChild method is conveninent to create trees in a top-down fashion, + // e.g. the Make factory constructor. + void AddChild(std::shared_ptr child) { + subtrees_.push_back(std::move(child)); + } }; +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, + const std::shared_ptr& tree); +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const PathTree& tree); + +ARROW_EXPORT bool operator==(const std::shared_ptr& lhs, + const std::shared_ptr& rhs); + } // namespace fs } // namespace arrow diff --git a/cpp/src/arrow/filesystem/path_tree_test.cc b/cpp/src/arrow/filesystem/path_tree_test.cc index 2138fbb3a9776..86127aba12d5e 100644 --- a/cpp/src/arrow/filesystem/path_tree_test.cc +++ b/cpp/src/arrow/filesystem/path_tree_test.cc @@ -37,25 +37,7 @@ static std::shared_ptr PT(FileStats stats) { static std::shared_ptr PT(FileStats stats, std::vector> subtrees) { - return std::make_shared(stats, std::move(subtrees)); -} - -// Utility functions to help testing/debugging -std::ostream& operator<<(std::ostream& os, const PathTree& tree) { - return os << "PathTree(" << tree.stats() << ")"; -} - -std::ostream& operator<<(std::ostream& os, const std::shared_ptr& tree) { - return os << *tree; -} - -bool operator==(const PathTree& lhs, const PathTree& rhs) { - return lhs.stats() == rhs.stats() && lhs.subtrees() == rhs.subtrees(); -} - -bool operator==(const std::shared_ptr& lhs, - const std::shared_ptr& rhs) { - return *lhs == *rhs; + return std::make_shared(std::move(stats), std::move(subtrees)); } void AssertMakePathTree(std::vector stats, @@ -79,11 +61,9 @@ TEST(TestPathTree, Basic) { // Multiple roots are supported. AssertMakePathTree({File("aa"), File("bb")}, {PT(File("aa")), PT(File("bb"))}); - // Note that an implementation detail is leaking in this test, directories - // are always first in the forest. AssertMakePathTree( {File("00"), Dir("AA"), File("AA/aa"), File("BB/bb")}, - {PT(Dir("AA"), {PT(File("AA/aa"))}), PT(File("00")), PT(File("BB/bb"))}); + {PT(File("00")), PT(Dir("AA"), {PT(File("AA/aa"))}), PT(File("BB/bb"))}); } TEST(TestPathTree, HourlyETL) { @@ -107,48 +87,53 @@ TEST(TestPathTree, HourlyETL) { }; std::vector stats; - std::vector> forest; + PathForest forest; for (int64_t year = 0; year < kYears; year++) { auto year_str = std::to_string(year + 2000); auto year_dir = Dir(year_str); stats.push_back(year_dir); - auto year_pt = PT(year_dir); - // years are roots in the forest - forest.push_back(year_pt); + PathForest months; for (int64_t month = 0; month < kMonthsPerYear; month++) { auto month_str = join({year_str, numbers[month + 1]}); auto month_dir = Dir(month_str); stats.push_back(month_dir); - auto month_pt = PT(month_dir); - year_pt->AddChild(month_pt); + PathForest days; for (int64_t day = 0; day < kDaysPerMonth; day++) { auto day_str = join({month_str, numbers[day + 1]}); auto day_dir = Dir(day_str); stats.push_back(day_dir); - auto day_pt = PT(day_dir); - month_pt->AddChild(day_pt); + PathForest hours; for (int64_t hour = 0; hour < kHoursPerDay; hour++) { auto hour_str = join({day_str, numbers[hour]}); auto hour_dir = Dir(hour_str); stats.push_back(hour_dir); - auto hour_pt = PT(hour_dir); - day_pt->AddChild(hour_pt); + PathForest files; for (int64_t file = 0; file < kFilesPerHour; file++) { auto file_str = join({hour_str, numbers[file] + ".parquet"}); auto file_fd = File(file_str); stats.push_back(file_fd); - - auto file_pt = PT(file_fd); - hour_pt->AddChild(file_pt); + files.push_back(PT(file_fd)); } + + auto hour_pt = PT(hour_dir, std::move(files)); + hours.push_back(hour_pt); } + + auto day_pt = PT(day_dir, std::move(hours)); + days.push_back(day_pt); } + + auto month_pt = PT(month_dir, std::move(days)); + months.push_back(month_pt); } + + auto year_pt = PT(year_dir, std::move(months)); + forest.push_back(year_pt); } AssertMakePathTree(stats, forest); diff --git a/cpp/src/arrow/filesystem/test_util.h b/cpp/src/arrow/filesystem/test_util.h index dd6ebc0501875..b7edc357243a4 100644 --- a/cpp/src/arrow/filesystem/test_util.h +++ b/cpp/src/arrow/filesystem/test_util.h @@ -43,15 +43,6 @@ static inline FileStats Dir(std::string path) { return st; } -// Utility functions to help testing/debugging -std::ostream& operator<<(std::ostream& os, const FileStats& stats) { - return os << "FileStats(" << stats.type() << ", " << stats.path() << ")"; -} - -bool operator==(const FileStats& lhs, const FileStats& rhs) { - return lhs.type() == rhs.type() && lhs.path() == rhs.path(); -} - ARROW_EXPORT void CreateFile(FileSystem* fs, const std::string& path, const std::string& data);