Skip to content

Commit

Permalink
Simplify implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
fsaintjacques committed Sep 26, 2019
1 parent 109ea85 commit 60b5945
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 130 deletions.
142 changes: 70 additions & 72 deletions cpp/src/arrow/filesystem/path_tree.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,110 +19,73 @@
#include "arrow/filesystem/path_tree.h"

#include <algorithm>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "arrow/filesystem/path_util.h"

namespace arrow {
namespace fs {

static int PathDepth(const std::string& path) {
return std::count(path.begin(), path.end(), internal::kSep);
}

using PathTreeByPathMap = std::unordered_map<std::string, std::shared_ptr<PathTree>>;

template <FileType Type>
bool IsType(const std ::shared_ptr<PathTree>& tree) {
return tree->stats().type() == Type;
}

std::shared_ptr<PathTree> FindAncestor(PathTreeByPathMap* directories, std::string path) {
std::shared_ptr<PathTree> FindAncestor(const PathTreeByPathMap& directories,
std::string path) {
while (path != "") {
auto parent = internal::GetAbstractPathParent(path).first;
auto found = directories->find(parent);
if (found != directories->end()) {
auto found = directories.find(parent);
if (found != directories.end()) {
return found->second;
}

path = parent;
path = std::move(parent);
}

return nullptr;
}

static void LinkToParentOrInsertNewRoot(FileStats stats, PathTreeByPathMap* directories,
PathTreeForest* forest) {
auto node = std::make_shared<PathTree>(stats);
if (stats.path() == "") {
forest->push_back(node);
return;
}

auto ancestor = FindAncestor(directories, stats.path());
if (ancestor) {
ancestor->AddChild(node);
} else {
forest->push_back(node);
}

if (IsType<FileType::Directory>(node)) {
directories->insert({stats.path(), node});
}
}

using DirectoryByDepthMap = std::unordered_map<int, std::vector<FileStats>>;
Status PathTree::Make(std::vector<FileStats> stats, PathForest* out) {
PathTreeByPathMap directories;
PathForest forest;

std::vector<int> OrderedDepths(const DirectoryByDepthMap& directories_by_depth) {
std::vector<int> depths;
for (auto k_v : directories_by_depth) {
depths.push_back(k_v.first);
}
auto link_parent_or_insert_root = [&directories, &forest](const FileStats& s) {
if (s.path() == "") {
return;
}

// In practice, this is going to be O(lg(n)lg(lg(n))), i.e. constant.
std::sort(depths.begin(), depths.end());
return depths;
}
auto ancestor = FindAncestor(directories, s.path());
auto node = std::make_shared<PathTree>(s);
if (ancestor) {
ancestor->AddChild(node);
} else {
forest.push_back(node);
}

Status PathTree::Make(std::vector<FileStats> stats, PathTreeForest* out) {
PathTreeByPathMap directories;
PathTreeForest forest;

// Partition the stats vector into (directories, others)
auto is_directory = [](const FileStats& stats) { return stats.IsDirectory(); };
std::stable_partition(stats.begin(), stats.end(), is_directory);
auto mid = std::partition_point(stats.begin(), stats.end(), is_directory);

// First, partition directories by path depth.
DirectoryByDepthMap directories_by_depth;
std::for_each(stats.begin(), mid, [&directories_by_depth](FileStats s) {
directories_by_depth[PathDepth(s.path())].push_back(s);
});

// Insert directories by ascending depth, ensuring that children directories
// are always inserted after ancestors.
for (int d : OrderedDepths(directories_by_depth)) {
auto dir = directories_by_depth.at(d);
std::for_each(dir.begin(), dir.end(), [&directories, &forest](FileStats s) {
LinkToParentOrInsertNewRoot(s, &directories, &forest);
});
}
if (s.type() == FileType::Directory) {
directories[s.path()] = node;
}
};

// Second, ingest files. By the same logic, directories are added before
// files, hence the lookup for ancestors is valid.
std::for_each(mid, stats.end(), [&directories, &forest](FileStats s) {
LinkToParentOrInsertNewRoot(s, &directories, &forest);
});
// Insert nodes by ascending path length, ensuring that nodes are always
// inserted after their ancestors. Note that this strategy does not account
// for special directories like '..'. It is expected that path are absolute.
auto cmp = [](const FileStats& lhs, const FileStats& rhs) {
return lhs.path().size() < rhs.path().size();
};
std::stable_sort(stats.begin(), stats.end(), cmp);
std::for_each(stats.cbegin(), stats.cend(), link_parent_or_insert_root);

*out = std::move(forest);
return Status::OK();
}

Status PathTree::Make(std::vector<FileStats> stats, std::shared_ptr<PathTree>* out) {
PathTreeForest forest;
PathForest forest;
RETURN_NOT_OK(Make(stats, &forest));

auto size = forest.size();
Expand All @@ -135,5 +98,40 @@ Status PathTree::Make(std::vector<FileStats> stats, std::shared_ptr<PathTree>* o
return Status::OK();
}

std::ostream& operator<<(std::ostream& os, const PathTree& tree) {
os << "PathTree(" << tree.stats();

const auto& subtrees = tree.subtrees();
if (subtrees.size()) {
os << ", [";
for (size_t i = 0; i < subtrees.size(); i++) {
if (i != 0) os << ", ";
os << *subtrees[i];
}
os << "]";
}
os << ")";
return os;
}

std::ostream& operator<<(std::ostream& os, const std::shared_ptr<PathTree>& tree) {
if (tree != nullptr) {
return os << *tree.get();
}

return os;
}

bool operator==(const std::shared_ptr<PathTree>& lhs,
const std::shared_ptr<PathTree>& rhs) {
if (lhs == NULLPTR && rhs == NULLPTR) {
return true;
} else if (lhs != NULLPTR && rhs != NULLPTR) {
return *lhs == *rhs;
}

return false;
}

} // namespace fs
} // namespace arrow
40 changes: 26 additions & 14 deletions cpp/src/arrow/filesystem/path_tree.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "arrow/filesystem/filesystem.h"

#include <algorithm>
#include <iosfwd>
#include <memory>
#include <utility>
#include <vector>
Expand All @@ -29,8 +30,8 @@ namespace fs {

class ARROW_EXPORT PathTree;

/// \brief A PathTreeForest consists of multiples PathTree
using PathTreeForest = std::vector<std::shared_ptr<PathTree>>;
/// \brief A PathForest consists of multiples PathTree
using PathForest = std::vector<std::shared_ptr<PathTree>>;

/// \brief A PathTree is a utility to transform a vector of FileStats into a
/// forest representation for tree traversal purposes. Node in the graph wraps
Expand All @@ -44,7 +45,7 @@ class ARROW_EXPORT PathTree {
/// \brief Transforms a FileStats vector into a forest of trees. Since there
/// is no guarantee of complete trees, it is possible to have a forest
/// (multiple roots). The caller should ensure that stats have unique path.
static Status Make(std::vector<FileStats> stats, PathTreeForest* out);
static Status Make(std::vector<FileStats> stats, PathForest* out);

/// \brief Like MakeForest but fails if there's more than one root.
static Status Make(std::vector<FileStats> stats, std::shared_ptr<PathTree>* out);
Expand All @@ -54,14 +55,6 @@ class ARROW_EXPORT PathTree {
/// \brief Returns the subtrees under this node.
std::vector<std::shared_ptr<PathTree>> subtrees() const { return subtrees_; }

template <typename Visitor>
void Visit(Visitor v) const {
v(stats_);

auto recurse = [&v](const std::shared_ptr<PathTree>& tree) { tree->Visit(v); };
std::for_each(subtrees_.cbegin(), subtrees_.cend(), recurse);
}

/// \brief Visit with eager pruning.
template <typename Visitor, typename Matcher>
void Visit(Visitor v, Matcher m) const {
Expand All @@ -71,18 +64,37 @@ class ARROW_EXPORT PathTree {

v(stats_);

auto recurse = [&v, &m](const std::shared_ptr<PathTree>& tree) { tree->Visit(v, m); };
auto recurse = [&v, &m](const std::shared_ptr<PathTree>& t) { t->Visit(v, m); };
std::for_each(subtrees_.cbegin(), subtrees_.cend(), recurse);
}

void AddChild(std::shared_ptr<PathTree> child) {
subtrees_.push_back(std::move(child));
template <typename Visitor>
void Visit(Visitor v) const {
auto always_match = [](const std::shared_ptr<PathTree>& t) { return true; };
return Visit(std::move(v), always_match);
}

bool operator==(const PathTree& other) const {
return stats_ == other.stats_ && subtrees_ == other.subtrees_;
}

protected:
FileStats stats_;
std::vector<std::shared_ptr<PathTree>> subtrees_;

// The AddChild method is conveninent to create trees in a top-down fashion,
// e.g. the Make factory constructor.
void AddChild(std::shared_ptr<PathTree> child) {
subtrees_.push_back(std::move(child));
}
};

ARROW_EXPORT std::ostream& operator<<(std::ostream& os,
const std::shared_ptr<PathTree>& tree);
ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const PathTree& tree);

ARROW_EXPORT bool operator==(const std::shared_ptr<PathTree>& lhs,
const std::shared_ptr<PathTree>& rhs);

} // namespace fs
} // namespace arrow
55 changes: 20 additions & 35 deletions cpp/src/arrow/filesystem/path_tree_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,25 +37,7 @@ static std::shared_ptr<PathTree> PT(FileStats stats) {

static std::shared_ptr<PathTree> PT(FileStats stats,
std::vector<std::shared_ptr<PathTree>> subtrees) {
return std::make_shared<PathTree>(stats, std::move(subtrees));
}

// Utility functions to help testing/debugging
std::ostream& operator<<(std::ostream& os, const PathTree& tree) {
return os << "PathTree(" << tree.stats() << ")";
}

std::ostream& operator<<(std::ostream& os, const std::shared_ptr<PathTree>& tree) {
return os << *tree;
}

bool operator==(const PathTree& lhs, const PathTree& rhs) {
return lhs.stats() == rhs.stats() && lhs.subtrees() == rhs.subtrees();
}

bool operator==(const std::shared_ptr<PathTree>& lhs,
const std::shared_ptr<PathTree>& rhs) {
return *lhs == *rhs;
return std::make_shared<PathTree>(std::move(stats), std::move(subtrees));
}

void AssertMakePathTree(std::vector<FileStats> stats,
Expand All @@ -79,11 +61,9 @@ TEST(TestPathTree, Basic) {

// Multiple roots are supported.
AssertMakePathTree({File("aa"), File("bb")}, {PT(File("aa")), PT(File("bb"))});
// Note that an implementation detail is leaking in this test, directories
// are always first in the forest.
AssertMakePathTree(
{File("00"), Dir("AA"), File("AA/aa"), File("BB/bb")},
{PT(Dir("AA"), {PT(File("AA/aa"))}), PT(File("00")), PT(File("BB/bb"))});
{PT(File("00")), PT(Dir("AA"), {PT(File("AA/aa"))}), PT(File("BB/bb"))});
}

TEST(TestPathTree, HourlyETL) {
Expand All @@ -107,48 +87,53 @@ TEST(TestPathTree, HourlyETL) {
};

std::vector<FileStats> stats;
std::vector<std::shared_ptr<PathTree>> forest;

PathForest forest;
for (int64_t year = 0; year < kYears; year++) {
auto year_str = std::to_string(year + 2000);
auto year_dir = Dir(year_str);
stats.push_back(year_dir);

auto year_pt = PT(year_dir);
// years are roots in the forest
forest.push_back(year_pt);
PathForest months;
for (int64_t month = 0; month < kMonthsPerYear; month++) {
auto month_str = join({year_str, numbers[month + 1]});
auto month_dir = Dir(month_str);
stats.push_back(month_dir);

auto month_pt = PT(month_dir);
year_pt->AddChild(month_pt);
PathForest days;
for (int64_t day = 0; day < kDaysPerMonth; day++) {
auto day_str = join({month_str, numbers[day + 1]});
auto day_dir = Dir(day_str);
stats.push_back(day_dir);

auto day_pt = PT(day_dir);
month_pt->AddChild(day_pt);
PathForest hours;
for (int64_t hour = 0; hour < kHoursPerDay; hour++) {
auto hour_str = join({day_str, numbers[hour]});
auto hour_dir = Dir(hour_str);
stats.push_back(hour_dir);

auto hour_pt = PT(hour_dir);
day_pt->AddChild(hour_pt);
PathForest files;
for (int64_t file = 0; file < kFilesPerHour; file++) {
auto file_str = join({hour_str, numbers[file] + ".parquet"});
auto file_fd = File(file_str);
stats.push_back(file_fd);

auto file_pt = PT(file_fd);
hour_pt->AddChild(file_pt);
files.push_back(PT(file_fd));
}

auto hour_pt = PT(hour_dir, std::move(files));
hours.push_back(hour_pt);
}

auto day_pt = PT(day_dir, std::move(hours));
days.push_back(day_pt);
}

auto month_pt = PT(month_dir, std::move(days));
months.push_back(month_pt);
}

auto year_pt = PT(year_dir, std::move(months));
forest.push_back(year_pt);
}

AssertMakePathTree(stats, forest);
Expand Down
9 changes: 0 additions & 9 deletions cpp/src/arrow/filesystem/test_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,6 @@ static inline FileStats Dir(std::string path) {
return st;
}

// Utility functions to help testing/debugging
std::ostream& operator<<(std::ostream& os, const FileStats& stats) {
return os << "FileStats(" << stats.type() << ", " << stats.path() << ")";
}

bool operator==(const FileStats& lhs, const FileStats& rhs) {
return lhs.type() == rhs.type() && lhs.path() == rhs.path();
}

ARROW_EXPORT
void CreateFile(FileSystem* fs, const std::string& path, const std::string& data);

Expand Down

0 comments on commit 60b5945

Please sign in to comment.