forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ARROW-6606: [C++] Add PathTree tree structure
Construct tree structure from std::vector<fs::FileStats>, following the path directory hierarchy.
- Loading branch information
1 parent
5b4a08f
commit 109ea85
Showing
9 changed files
with
420 additions
and
14 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
// | ||
|
||
#include "arrow/filesystem/path_tree.h" | ||
|
||
#include <algorithm> | ||
#include <memory> | ||
#include <string> | ||
#include <unordered_map> | ||
#include <utility> | ||
|
||
#include "arrow/filesystem/path_util.h" | ||
|
||
namespace arrow { | ||
namespace fs { | ||
|
||
static int PathDepth(const std::string& path) { | ||
return std::count(path.begin(), path.end(), internal::kSep); | ||
} | ||
|
||
using PathTreeByPathMap = std::unordered_map<std::string, std::shared_ptr<PathTree>>; | ||
|
||
template <FileType Type> | ||
bool IsType(const std ::shared_ptr<PathTree>& tree) { | ||
return tree->stats().type() == Type; | ||
} | ||
|
||
/// \brief Looks up the closest ancestor of `path` registered in `directories`.
///
/// Climbs from `path` one parent at a time (as computed by
/// internal::GetAbstractPathParent) and returns the node of the first parent
/// present in `directories`. Returns nullptr when the climb reaches the empty
/// path without finding a registered parent.
std::shared_ptr<PathTree> FindAncestor(PathTreeByPathMap* directories, std::string path) {
  while (!path.empty()) {
    auto parent_path = internal::GetAbstractPathParent(path).first;
    const auto match = directories->find(parent_path);
    if (match == directories->end()) {
      // This parent is not a known directory: keep climbing.
      path = std::move(parent_path);
      continue;
    }
    return match->second;
  }

  return nullptr;
}
|
||
/// \brief Wraps `stats` in a new PathTree node and places it in the forest.
///
/// - A node whose path is empty is always appended to `forest` as a root and
///   is not registered in `directories`.
/// - Otherwise the node becomes a child of its closest ancestor found in
///   `directories`, or a new root of `forest` when no ancestor is registered.
/// - Directory nodes (with a non-empty path) are then registered in
///   `directories` so that later entries can find them as ancestors.
static void LinkToParentOrInsertNewRoot(FileStats stats, PathTreeByPathMap* directories,
                                        PathTreeForest* forest) {
  const auto node = std::make_shared<PathTree>(stats);

  if (stats.path().empty()) {
    forest->push_back(node);
    return;
  }

  if (auto parent = FindAncestor(directories, stats.path())) {
    parent->AddChild(node);
  } else {
    forest->push_back(node);
  }

  const bool is_directory = node->stats().type() == FileType::Directory;
  if (is_directory) {
    // emplace, like insert, leaves an already registered path untouched.
    directories->emplace(stats.path(), node);
  }
}
|
||
using DirectoryByDepthMap = std::unordered_map<int, std::vector<FileStats>>; | ||
|
||
std::vector<int> OrderedDepths(const DirectoryByDepthMap& directories_by_depth) { | ||
std::vector<int> depths; | ||
for (auto k_v : directories_by_depth) { | ||
depths.push_back(k_v.first); | ||
} | ||
|
||
// In practice, this is going to be O(lg(n)lg(lg(n))), i.e. constant. | ||
std::sort(depths.begin(), depths.end()); | ||
return depths; | ||
} | ||
|
||
/// \brief Builds a forest of PathTree from `stats` and stores it in `*out`.
///
/// Directories are inserted first, by ascending path depth, so that every
/// directory is registered before any of its descendants is linked. All other
/// entries are inserted afterwards. An entry without a registered ancestor
/// becomes a new root of the forest.
///
/// \param[in] stats entries to organize; taken by value and consumed.
/// \param[out] out receives the resulting forest; dereferenced without a null check.
/// \return Status::OK()
Status PathTree::Make(std::vector<FileStats> stats, PathTreeForest* out) {
  PathTreeByPathMap directories;
  PathTreeForest forest;

  // Partition the stats vector into (directories, others). stable_partition
  // already returns the boundary between the two groups, so there is no need
  // to search for it again.
  const auto mid =
      std::stable_partition(stats.begin(), stats.end(),
                            [](const FileStats& s) { return s.IsDirectory(); });

  // First, bucket directories by path depth. `stats` is a private copy that is
  // not read again for these entries, so they can be moved into the buckets.
  DirectoryByDepthMap directories_by_depth;
  for (auto it = stats.begin(); it != mid; ++it) {
    const int depth = PathDepth(it->path());
    directories_by_depth[depth].push_back(std::move(*it));
  }

  // Insert directories by ascending depth, ensuring that children directories
  // are always inserted after ancestors.
  for (const int depth : OrderedDepths(directories_by_depth)) {
    for (const auto& dir_stats : directories_by_depth.at(depth)) {
      LinkToParentOrInsertNewRoot(dir_stats, &directories, &forest);
    }
  }

  // Second, ingest the remaining entries. Directories were all added above,
  // hence the lookup for ancestors is valid.
  for (auto it = mid; it != stats.end(); ++it) {
    LinkToParentOrInsertNewRoot(std::move(*it), &directories, &forest);
  }

  *out = std::move(forest);
  return Status::OK();
}
|
||
/// \brief Builds a single PathTree from `stats`.
///
/// Delegates to the PathTreeForest overload of Make, then:
/// - more than one root: returns Status::Invalid and leaves `*out` untouched;
/// - exactly one root: stores it in `*out`;
/// - no root (empty forest): returns OK and leaves `*out` untouched.
///
/// \param[in] stats entries to organize; taken by value and consumed.
/// \param[out] out receives the single root, if any.
Status PathTree::Make(std::vector<FileStats> stats, std::shared_ptr<PathTree>* out) {
  PathTreeForest forest;
  // `stats` is not used afterwards: hand it over instead of copying the vector.
  RETURN_NOT_OK(Make(std::move(stats), &forest));

  const auto size = forest.size();
  if (size > 1) {
    return Status::Invalid("Requested PathTree has ", size, " roots, but expected 1.");
  }
  if (size == 1) {
    *out = std::move(forest.front());
  }

  return Status::OK();
}
|
||
} // namespace fs | ||
} // namespace arrow |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
// Licensed to the Apache Software Foundation (ASF) under one | ||
// or more contributor license agreements. See the NOTICE file | ||
// distributed with this work for additional information | ||
// regarding copyright ownership. The ASF licenses this file | ||
// to you under the Apache License, Version 2.0 (the | ||
// "License"); you may not use this file except in compliance | ||
// with the License. You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, | ||
// software distributed under the License is distributed on an | ||
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
// KIND, either express or implied. See the License for the | ||
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
#pragma once | ||
|
||
#include "arrow/filesystem/filesystem.h" | ||
|
||
#include <algorithm> | ||
#include <memory> | ||
#include <utility> | ||
#include <vector> | ||
|
||
namespace arrow { | ||
namespace fs { | ||
|
||
class ARROW_EXPORT PathTree; | ||
|
||
/// \brief A PathTreeForest consists of multiple PathTrees. | ||
using PathTreeForest = std::vector<std::shared_ptr<PathTree>>; | ||
|
||
/// \brief A PathTree is a utility to transform a vector of FileStats into a | ||
/// forest representation for tree traversal purposes. Node in the graph wraps | ||
/// a FileStats. Files are expected to be found only at leaves of the tree. | ||
class ARROW_EXPORT PathTree { | ||
public: | ||
explicit PathTree(FileStats stats) : stats_(stats) {} | ||
PathTree(FileStats stats, std::vector<std::shared_ptr<PathTree>> subtrees) | ||
: stats_(stats), subtrees_(std::move(subtrees)) {} | ||
|
||
/// \brief Transforms a FileStats vector into a forest of trees. Since there | ||
/// is no guarantee of complete trees, it is possible to have a forest | ||
/// (multiple roots). The caller should ensure that stats have unique path. | ||
static Status Make(std::vector<FileStats> stats, PathTreeForest* out); | ||
|
||
/// \brief Like MakeForest but fails if there's more than one root. | ||
static Status Make(std::vector<FileStats> stats, std::shared_ptr<PathTree>* out); | ||
|
||
/// \brief Returns the FileStat of this node. | ||
FileStats stats() const { return stats_; } | ||
/// \brief Returns the subtrees under this node. | ||
std::vector<std::shared_ptr<PathTree>> subtrees() const { return subtrees_; } | ||
|
||
template <typename Visitor> | ||
void Visit(Visitor v) const { | ||
v(stats_); | ||
|
||
auto recurse = [&v](const std::shared_ptr<PathTree>& tree) { tree->Visit(v); }; | ||
std::for_each(subtrees_.cbegin(), subtrees_.cend(), recurse); | ||
} | ||
|
||
/// \brief Visit with eager pruning. | ||
template <typename Visitor, typename Matcher> | ||
void Visit(Visitor v, Matcher m) const { | ||
if (!m(stats_)) { | ||
return; | ||
} | ||
|
||
v(stats_); | ||
|
||
auto recurse = [&v, &m](const std::shared_ptr<PathTree>& tree) { tree->Visit(v, m); }; | ||
std::for_each(subtrees_.cbegin(), subtrees_.cend(), recurse); | ||
} | ||
|
||
void AddChild(std::shared_ptr<PathTree> child) { | ||
subtrees_.push_back(std::move(child)); | ||
} | ||
|
||
protected: | ||
FileStats stats_; | ||
std::vector<std::shared_ptr<PathTree>> subtrees_; | ||
}; | ||
|
||
} // namespace fs | ||
} // namespace arrow |
Oops, something went wrong.