-
Notifications
You must be signed in to change notification settings - Fork 3.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ARROW-6606: [C++] Add PathTree tree structure
Construct tree structure from std::vector<fs::FileStats>, following the path directory hierarchy. Closes #5430 from fsaintjacques/ARROW-6606-path-tree and squashes the following commits: 43d19fa <François Saint-Jacques> Address comments 60b5945 <François Saint-Jacques> Simplify implementation 109ea85 <François Saint-Jacques> ARROW-6606: Add PathTree tree structure Authored-by: François Saint-Jacques <[email protected]> Signed-off-by: Benjamin Kietzman <[email protected]>
- Loading branch information
1 parent
2dc020c
commit dec0cfb
Showing
9 changed files
with
452 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
// | ||
|
||
#include "arrow/filesystem/path_tree.h" | ||
|
||
#include <algorithm> | ||
#include <iostream> | ||
#include <map> | ||
#include <memory> | ||
#include <string> | ||
#include <unordered_map> | ||
#include <utility> | ||
#include <vector> | ||
|
||
#include "arrow/filesystem/path_util.h" | ||
|
||
namespace arrow { | ||
namespace fs { | ||
|
||
using PathTreeByPathMap = std::unordered_map<std::string, std::shared_ptr<PathTree>>; | ||
|
||
std::shared_ptr<PathTree> FindAncestor(const PathTreeByPathMap& directories, | ||
std::string path) { | ||
while (path != "") { | ||
auto parent = internal::GetAbstractPathParent(path).first; | ||
auto found = directories.find(parent); | ||
if (found != directories.end()) { | ||
return found->second; | ||
} | ||
|
||
path = std::move(parent); | ||
} | ||
|
||
return nullptr; | ||
} | ||
|
||
// Build a forest out of `stats`. Every entry is attached to the closest
// ancestor directory that was itself part of `stats`; entries without such an
// ancestor become roots of the forest. Entries with an empty path are ignored.
Status PathTree::Make(std::vector<FileStats> stats, PathForest* out) {
  // Process entries by ascending path length so that a directory is always
  // registered before any of its descendants is looked up. The sort is stable,
  // hence entries with equal path length keep their input order. Note that
  // this strategy does not account for special directories like '..'; paths
  // are expected to be absolute.
  std::stable_sort(stats.begin(), stats.end(),
                   [](const FileStats& a, const FileStats& b) {
                     return a.path().size() < b.path().size();
                   });

  PathTreeByPathMap known_dirs;
  PathForest roots;

  for (const FileStats& entry : stats) {
    if (entry.path().empty()) {
      continue;
    }

    auto parent = FindAncestor(known_dirs, entry.path());
    auto node = std::make_shared<PathTree>(entry);
    if (parent == nullptr) {
      // No known ancestor: this entry starts a new tree.
      roots.push_back(node);
    } else {
      parent->AddChild(node);
    }

    // Only directories can later act as ancestors of other entries.
    if (entry.type() == FileType::Directory) {
      known_dirs[entry.path()] = node;
    }
  }

  *out = std::move(roots);
  return Status::OK();
}
|
||
// Build a single tree out of `stats`.
//
// Delegates to the PathForest overload of Make, then inspects the number of
// roots that were produced:
//   - more than one root: returns Status::Invalid and does not assign `*out`;
//   - exactly one root: `*out` is set to that root;
//   - no root at all: `*out` is reset to null, so that a successful call never
//     leaves the caller with an unassigned or stale pointer.
Status PathTree::Make(std::vector<FileStats> stats, std::shared_ptr<PathTree>* out) {
  PathForest forest;
  // `stats` is a by-value parameter that is not used afterwards: move it into
  // the delegated call instead of copying the whole vector a second time.
  RETURN_NOT_OK(Make(std::move(stats), &forest));

  const auto size = forest.size();
  if (size > 1) {
    return Status::Invalid("Requested PathTree has ", size, " roots, but expected 1.");
  }

  if (size == 1) {
    *out = std::move(forest[0]);
  } else {
    out->reset();
  }

  return Status::OK();
}
|
||
std::ostream& operator<<(std::ostream& os, const PathTree& tree) { | ||
os << "PathTree(" << tree.stats(); | ||
|
||
const auto& subtrees = tree.subtrees(); | ||
if (subtrees.size()) { | ||
os << ", ["; | ||
for (size_t i = 0; i < subtrees.size(); i++) { | ||
if (i != 0) os << ", "; | ||
os << *subtrees[i]; | ||
} | ||
os << "]"; | ||
} | ||
os << ")"; | ||
return os; | ||
} | ||
|
||
std::ostream& operator<<(std::ostream& os, const std::shared_ptr<PathTree>& tree) { | ||
if (tree != nullptr) { | ||
return os << *tree.get(); | ||
} | ||
|
||
return os; | ||
} | ||
|
||
bool operator==(const std::shared_ptr<PathTree>& lhs, | ||
const std::shared_ptr<PathTree>& rhs) { | ||
if (lhs == NULLPTR && rhs == NULLPTR) { | ||
return true; | ||
} else if (lhs != NULLPTR && rhs != NULLPTR) { | ||
return *lhs == *rhs; | ||
} | ||
|
||
return false; | ||
} | ||
|
||
} // namespace fs | ||
} // namespace arrow |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#pragma once | ||
|
||
#include "arrow/filesystem/filesystem.h" | ||
|
||
#include <algorithm> | ||
#include <iosfwd> | ||
#include <memory> | ||
#include <utility> | ||
#include <vector> | ||
|
||
#include "arrow/status.h" | ||
|
||
namespace arrow { | ||
namespace fs { | ||
|
||
class ARROW_EXPORT PathTree; | ||
|
||
/// \brief A PathForest consists of multiple PathTrees | ||
using PathForest = std::vector<std::shared_ptr<PathTree>>; | ||
|
||
/// \brief A PathTree is a utility to transform a vector of FileStats into a | ||
/// forest representation for tree traversal purposes. Node in the graph wraps | ||
/// a FileStats. Files are expected to be found only at leaves of the tree. | ||
class ARROW_EXPORT PathTree { | ||
public: | ||
explicit PathTree(FileStats stats) : stats_(stats) {} | ||
PathTree(FileStats stats, std::vector<std::shared_ptr<PathTree>> subtrees) | ||
: stats_(stats), subtrees_(std::move(subtrees)) {} | ||
|
||
/// \brief Transforms a FileStats vector into a forest of trees. Since there | ||
/// is no guarantee of complete trees, it is possible to have a forest | ||
/// (multiple roots). The caller should ensure that stats have unique path. | ||
static Status Make(std::vector<FileStats> stats, PathForest* out); | ||
|
||
/// \brief Like MakeForest but fails if there's more than one root. | ||
static Status Make(std::vector<FileStats> stats, std::shared_ptr<PathTree>* out); | ||
|
||
/// \brief Returns the FileStat of this node. | ||
FileStats stats() const { return stats_; } | ||
/// \brief Returns the subtrees under this node. | ||
std::vector<std::shared_ptr<PathTree>> subtrees() const { return subtrees_; } | ||
|
||
/// \brief Visit with eager pruning. | ||
template <typename Visitor, typename Matcher> | ||
Status Visit(Visitor&& v, Matcher&& m) const { | ||
bool match = false; | ||
ARROW_RETURN_NOT_OK(m(stats_, &match)); | ||
if (!match) { | ||
return Status::OK(); | ||
} | ||
|
||
ARROW_RETURN_NOT_OK(v(stats_)); | ||
|
||
for (const auto& t : subtrees_) { | ||
ARROW_RETURN_NOT_OK(t->Visit(v, m)); | ||
} | ||
|
||
return Status::OK(); | ||
} | ||
|
||
template <typename Visitor> | ||
Status Visit(Visitor&& v) const { | ||
auto always_match = [](const FileStats& t, bool* match) { | ||
*match = true; | ||
return Status::OK(); | ||
}; | ||
return Visit(v, always_match); | ||
} | ||
|
||
bool operator==(const PathTree& other) const { | ||
return stats_ == other.stats_ && subtrees_ == other.subtrees_; | ||
} | ||
|
||
protected: | ||
FileStats stats_; | ||
std::vector<std::shared_ptr<PathTree>> subtrees_; | ||
|
||
// The AddChild method is convenient to create trees in a top-down fashion, | ||
// e.g. the Make factory constructor. | ||
void AddChild(std::shared_ptr<PathTree> child) { | ||
subtrees_.push_back(std::move(child)); | ||
} | ||
}; | ||
|
||
ARROW_EXPORT std::ostream& operator<<(std::ostream& os, | ||
const std::shared_ptr<PathTree>& tree); | ||
ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const PathTree& tree); | ||
|
||
ARROW_EXPORT bool operator==(const std::shared_ptr<PathTree>& lhs, | ||
const std::shared_ptr<PathTree>& rhs); | ||
|
||
} // namespace fs | ||
} // namespace arrow |
Oops, something went wrong.