diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index d575faff822aa..91ff44369982a 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -117,6 +117,7 @@ set(ARROW_SRCS filesystem/filesystem.cc filesystem/localfs.cc filesystem/mockfs.cc + filesystem/path_tree.cc filesystem/path_util.cc filesystem/util_internal.cc io/buffered.cc diff --git a/cpp/src/arrow/filesystem/CMakeLists.txt b/cpp/src/arrow/filesystem/CMakeLists.txt index 148fa7484037b..efb78055a9aa3 100644 --- a/cpp/src/arrow/filesystem/CMakeLists.txt +++ b/cpp/src/arrow/filesystem/CMakeLists.txt @@ -20,6 +20,7 @@ arrow_install_all_headers("arrow/filesystem") add_arrow_test(filesystem_test) add_arrow_test(localfs_test) +add_arrow_test(path_tree_test) if(ARROW_S3) add_arrow_test(s3fs_test) diff --git a/cpp/src/arrow/filesystem/filesystem.cc b/cpp/src/arrow/filesystem/filesystem.cc index 9ec5735a9cf5c..aa9677e45f261 100644 --- a/cpp/src/arrow/filesystem/filesystem.cc +++ b/cpp/src/arrow/filesystem/filesystem.cc @@ -71,6 +71,10 @@ std::string FileStats::base_name() const { return internal::GetAbstractPathParent(path_).second; } +std::string FileStats::dir_name() const { + return internal::GetAbstractPathParent(path_).first; +} + // Debug helper std::ostream& operator<<(std::ostream& os, const FileStats& stats) { return os << "FileStats(" << stats.type() << ", " << stats.path() << ")"; diff --git a/cpp/src/arrow/filesystem/filesystem.h b/cpp/src/arrow/filesystem/filesystem.h index b4e38a3c2e20d..bc17346894eb9 100644 --- a/cpp/src/arrow/filesystem/filesystem.h +++ b/cpp/src/arrow/filesystem/filesystem.h @@ -97,6 +97,9 @@ struct ARROW_EXPORT FileStats { /// The file base name (component after the last directory separator) std::string base_name() const; + /// The directory name (component before the file base name). + std::string dir_name() const; + /// The size in bytes, if available /// /// Only regular files are guaranteed to have a size.
@@ -110,6 +113,9 @@ struct ARROW_EXPORT FileStats { TimePoint mtime() const { return mtime_; } void set_mtime(TimePoint mtime) { mtime_ = mtime; } + bool IsFile() const { return type_ == FileType::File; } + bool IsDirectory() const { return type_ == FileType::Directory; } + bool operator==(const FileStats& other) const { return type() == other.type() && path() == other.path() && size() == other.size() && mtime() == other.mtime(); diff --git a/cpp/src/arrow/filesystem/path_tree.cc b/cpp/src/arrow/filesystem/path_tree.cc new file mode 100644 index 0000000000000..682d3ba10215c --- /dev/null +++ b/cpp/src/arrow/filesystem/path_tree.cc @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+// + +#include "arrow/filesystem/path_tree.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/filesystem/path_util.h" + +namespace arrow { +namespace fs { + +using PathTreeByPathMap = std::unordered_map>; + +std::shared_ptr FindAncestor(const PathTreeByPathMap& directories, + std::string path) { + while (path != "") { + auto parent = internal::GetAbstractPathParent(path).first; + auto found = directories.find(parent); + if (found != directories.end()) { + return found->second; + } + + path = std::move(parent); + } + + return nullptr; +} + +Status PathTree::Make(std::vector stats, PathForest* out) { + PathTreeByPathMap directories; + PathForest forest; + + auto link_parent_or_insert_root = [&directories, &forest](const FileStats& s) { + if (s.path() == "") { + return; + } + + auto ancestor = FindAncestor(directories, s.path()); + auto node = std::make_shared(s); + if (ancestor) { + ancestor->AddChild(node); + } else { + forest.push_back(node); + } + + if (s.type() == FileType::Directory) { + directories[s.path()] = node; + } + }; + + // Insert nodes by ascending path length, ensuring that nodes are always + // inserted after their ancestors. Note that this strategy does not account + // for special directories like '..'. It is expected that paths are absolute.
+ auto cmp = [](const FileStats& lhs, const FileStats& rhs) { + return lhs.path().size() < rhs.path().size(); + }; + std::stable_sort(stats.begin(), stats.end(), cmp); + std::for_each(stats.cbegin(), stats.cend(), link_parent_or_insert_root); + + *out = std::move(forest); + return Status::OK(); +} + +Status PathTree::Make(std::vector stats, std::shared_ptr* out) { + PathForest forest; + RETURN_NOT_OK(Make(stats, &forest)); + + auto size = forest.size(); + if (size > 1) { + return Status::Invalid("Requested PathTree has ", size, " roots, but expected 1."); + } else if (size == 1) { + *out = forest[0]; + } + + return Status::OK(); +} + +std::ostream& operator<<(std::ostream& os, const PathTree& tree) { + os << "PathTree(" << tree.stats(); + + const auto& subtrees = tree.subtrees(); + if (subtrees.size()) { + os << ", ["; + for (size_t i = 0; i < subtrees.size(); i++) { + if (i != 0) os << ", "; + os << *subtrees[i]; + } + os << "]"; + } + os << ")"; + return os; +} + +std::ostream& operator<<(std::ostream& os, const std::shared_ptr& tree) { + if (tree != nullptr) { + return os << *tree.get(); + } + + return os; +} + +bool operator==(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) { + if (lhs == NULLPTR && rhs == NULLPTR) { + return true; + } else if (lhs != NULLPTR && rhs != NULLPTR) { + return *lhs == *rhs; + } + + return false; +} + +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/path_tree.h b/cpp/src/arrow/filesystem/path_tree.h new file mode 100644 index 0000000000000..50ec1f9470450 --- /dev/null +++ b/cpp/src/arrow/filesystem/path_tree.h @@ -0,0 +1,110 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership.
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/filesystem/filesystem.h" + +#include +#include +#include +#include +#include + +#include "arrow/status.h" + +namespace arrow { +namespace fs { + +class ARROW_EXPORT PathTree; + +/// \brief A PathForest consists of multiple PathTrees +using PathForest = std::vector>; + +/// \brief A PathTree is a utility to transform a vector of FileStats into a +/// forest representation for tree traversal purposes. Each node in the graph wraps +/// a FileStats. Files are expected to be found only at leaves of the tree. +class ARROW_EXPORT PathTree { + public: + explicit PathTree(FileStats stats) : stats_(stats) {} + PathTree(FileStats stats, std::vector> subtrees) + : stats_(stats), subtrees_(std::move(subtrees)) {} + + /// \brief Transforms a FileStats vector into a forest of trees. Since there + /// is no guarantee of complete trees, it is possible to have a forest + /// (multiple roots). The caller should ensure that stats have unique paths. + static Status Make(std::vector stats, PathForest* out); + + /// \brief Like the PathForest overload, but fails if there's more than one root. + static Status Make(std::vector stats, std::shared_ptr* out); + + /// \brief Returns the FileStats of this node. + FileStats stats() const { return stats_; } + /// \brief Returns the subtrees under this node. + std::vector> subtrees() const { return subtrees_; } + + /// \brief Visit with eager pruning: a node rejected by the matcher is skipped along with its subtrees.
+ template + Status Visit(Visitor&& v, Matcher&& m) const { + bool match = false; + ARROW_RETURN_NOT_OK(m(stats_, &match)); + if (!match) { + return Status::OK(); + } + + ARROW_RETURN_NOT_OK(v(stats_)); + + for (const auto& t : subtrees_) { + ARROW_RETURN_NOT_OK(t->Visit(v, m)); + } + + return Status::OK(); + } + + template + Status Visit(Visitor&& v) const { + auto always_match = [](const FileStats& t, bool* match) { + *match = true; + return Status::OK(); + }; + return Visit(v, always_match); + } + + bool operator==(const PathTree& other) const { + return stats_ == other.stats_ && subtrees_ == other.subtrees_; + } + + protected: + FileStats stats_; + std::vector> subtrees_; + + // The AddChild method is convenient to create trees in a top-down fashion, + // e.g. the Make factory constructor. + void AddChild(std::shared_ptr child) { + subtrees_.push_back(std::move(child)); + } +}; + +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, + const std::shared_ptr& tree); +ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const PathTree& tree); + +ARROW_EXPORT bool operator==(const std::shared_ptr& lhs, + const std::shared_ptr& rhs); + +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/path_tree_test.cc b/cpp/src/arrow/filesystem/path_tree_test.cc new file mode 100644 index 0000000000000..fb38ad45835ef --- /dev/null +++ b/cpp/src/arrow/filesystem/path_tree_test.cc @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License.
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/filesystem/path_tree.h" + +#include +#include +#include + +#include +#include + +#include "arrow/filesystem/path_util.h" +#include "arrow/filesystem/test_util.h" +#include "arrow/testing/gtest_util.h" + +using testing::ContainerEq; + +namespace arrow { +namespace fs { + +static std::shared_ptr PT(FileStats stats) { + return std::make_shared(std::move(stats)); +} + +static std::shared_ptr PT(FileStats stats, + std::vector> subtrees) { + return std::make_shared(std::move(stats), std::move(subtrees)); +} + +void AssertMakePathTree(std::vector stats, + std::vector> expected) { + std::vector> actual; + + ASSERT_OK(PathTree::Make(stats, &actual)); + EXPECT_THAT(actual, ContainerEq(expected)); +} + +TEST(TestPathTree, Basic) { + AssertMakePathTree({}, {}); + + AssertMakePathTree({File("aa")}, {PT(File("aa"))}); + AssertMakePathTree({Dir("AA")}, {PT(Dir("AA"))}); + AssertMakePathTree({Dir("AA"), File("AA/aa")}, {PT(Dir("AA"), {PT(File("AA/aa"))})}); + + // Missing parent can still find ancestor. + AssertMakePathTree({Dir("AA"), File("AA/BB/bb")}, + {PT(Dir("AA"), {PT(File("AA/BB/bb"))})}); + + // Children should link to their parent regardless of the input ordering + AssertMakePathTree({File("AA/aa"), Dir("AA")}, {PT(Dir("AA"), {PT(File("AA/aa"))})}); + + // Multiple roots are supported.
+ AssertMakePathTree({File("aa"), File("bb")}, {PT(File("aa")), PT(File("bb"))}); + AssertMakePathTree( + {File("00"), Dir("AA"), File("AA/aa"), File("BB/bb")}, + {PT(File("00")), PT(Dir("AA"), {PT(File("AA/aa"))}), PT(File("BB/bb"))}); +} + +TEST(TestPathTree, HourlyETL) { + // This test mimics a scenario where an ETL dumps hourly files in a structure + // `$year/$month/$day/$hour/*.parquet`. + + constexpr int64_t kYears = 8; + constexpr int64_t kMonthsPerYear = 12; + constexpr int64_t kDaysPerMonth = 31; + constexpr int64_t kHoursPerDay = 24; + constexpr int64_t kFilesPerHour = 4; + + // Pre-compute number strings to avoid re-constructing them in the loops below + std::vector numbers{kDaysPerMonth + 1}; + for (size_t i = 0; i < numbers.size(); i++) { + numbers[i] = std::to_string(i); + } + + auto join = [](const std::vector& path) { + return internal::JoinAbstractPath(path); + }; + + std::vector stats; + + PathForest forest; + for (int64_t year = 0; year < kYears; year++) { + auto year_str = std::to_string(year + 2000); + auto year_dir = Dir(year_str); + stats.push_back(year_dir); + + PathForest months; + for (int64_t month = 0; month < kMonthsPerYear; month++) { + auto month_str = join({year_str, numbers[month + 1]}); + auto month_dir = Dir(month_str); + stats.push_back(month_dir); + + PathForest days; + for (int64_t day = 0; day < kDaysPerMonth; day++) { + auto day_str = join({month_str, numbers[day + 1]}); + auto day_dir = Dir(day_str); + stats.push_back(day_dir); + + PathForest hours; + for (int64_t hour = 0; hour < kHoursPerDay; hour++) { + auto hour_str = join({day_str, numbers[hour]}); + auto hour_dir = Dir(hour_str); + stats.push_back(hour_dir); + + PathForest files; + for (int64_t file = 0; file < kFilesPerHour; file++) { + auto file_str = join({hour_str, numbers[file] + ".parquet"}); + auto file_fd = File(file_str); + stats.push_back(file_fd); + files.push_back(PT(file_fd)); + } + + auto hour_pt = PT(hour_dir, std::move(files)); + hours.push_back(hour_pt); + } + + auto day_pt = PT(day_dir,
std::move(hours)); + days.push_back(day_pt); + } + + auto month_pt = PT(month_dir, std::move(days)); + months.push_back(month_pt); + } + + auto year_pt = PT(year_dir, std::move(months)); + forest.push_back(year_pt); + } + + AssertMakePathTree(stats, forest); +} + +TEST(TestPathTree, Visit) { + std::shared_ptr tree; + ASSERT_OK(PathTree::Make({Dir("A"), File("A/a")}, &tree)); + + // Should propagate failure + auto visit_noop = [](const FileStats&) { return Status::OK(); }; + ASSERT_OK(tree->Visit(visit_noop)); + auto visit_fail = [](const FileStats&) { return Status::Invalid(""); }; + ASSERT_RAISES(Invalid, tree->Visit(visit_fail)); + auto match_fail = [](const FileStats&, bool* match) { return Status::Invalid(""); }; + ASSERT_RAISES(Invalid, tree->Visit(visit_noop, match_fail)); + + // Ensure basic visit of all nodes + std::vector collect; + auto visit = [&collect](const FileStats& f) { + collect.push_back(f); + return Status::OK(); + }; + ASSERT_OK(tree->Visit(visit)); + EXPECT_THAT(collect, ContainerEq(std::vector{Dir("A"), File("A/a")})); + + // Matcher should be evaluated on all nodes + collect.resize(0); + auto match_dir = [](const FileStats& s, bool* m) { + *m = s.IsDirectory(); + return Status::OK(); + }; + ASSERT_OK(tree->Visit(visit, match_dir)); + EXPECT_THAT(collect, ContainerEq(std::vector{Dir("A")})); +} + +} // namespace fs +} // namespace arrow diff --git a/cpp/src/arrow/filesystem/test_util.cc b/cpp/src/arrow/filesystem/test_util.cc index 47f423ca20029..6786e3973a431 100644 --- a/cpp/src/arrow/filesystem/test_util.cc +++ b/cpp/src/arrow/filesystem/test_util.cc @@ -649,20 +649,6 @@ void GenericFileSystemTest::TestGetTargetStatsSelector(FileSystem* fs) { ASSERT_RAISES(IOError, fs->GetTargetStats(s, &stats)); } -FileStats File(std::string path) { - FileStats st; - st.set_type(FileType::File); - st.set_path(path); - return st; -} - -FileStats Dir(std::string path) { - FileStats st; - st.set_type(FileType::Directory); - st.set_path(path); - return st; -}
- void GetSortedStats(FileSystem* fs, Selector s, std::vector& stats) { ASSERT_OK(fs->GetTargetStats(s, &stats)); // Clear mtime & size for easier testing. diff --git a/cpp/src/arrow/filesystem/test_util.h b/cpp/src/arrow/filesystem/test_util.h index e9c7f708d57ff..b7edc357243a4 100644 --- a/cpp/src/arrow/filesystem/test_util.h +++ b/cpp/src/arrow/filesystem/test_util.h @@ -29,6 +29,20 @@ namespace fs { static constexpr double kTimeSlack = 2.0; // In seconds +static inline FileStats File(std::string path) { + FileStats st; + st.set_type(FileType::File); + st.set_path(path); + return st; +} + +static inline FileStats Dir(std::string path) { + FileStats st; + st.set_type(FileType::Directory); + st.set_path(path); + return st; +} + ARROW_EXPORT void CreateFile(FileSystem* fs, const std::string& path, const std::string& data);