Skip to content

Commit

Permalink
ARROW-6606: [C++] Add PathTree tree structure
Browse files Browse the repository at this point in the history
Construct tree structure from std::vector<fs::FileStats>, following the path directory hierarchy.

Closes #5430 from fsaintjacques/ARROW-6606-path-tree and squashes the following commits:

43d19fa <François Saint-Jacques> Address comments
60b5945 <François Saint-Jacques> Simplify implementation
109ea85 <François Saint-Jacques> ARROW-6606:  Add PathTree tree structure

Authored-by: François Saint-Jacques <[email protected]>
Signed-off-by: Benjamin Kietzman <[email protected]>
  • Loading branch information
fsaintjacques authored and bkietz committed Sep 26, 2019
1 parent 2dc020c commit dec0cfb
Show file tree
Hide file tree
Showing 9 changed files with 452 additions and 14 deletions.
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ set(ARROW_SRCS
filesystem/filesystem.cc
filesystem/localfs.cc
filesystem/mockfs.cc
filesystem/path_tree.cc
filesystem/path_util.cc
filesystem/util_internal.cc
io/buffered.cc
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/filesystem/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ arrow_install_all_headers("arrow/filesystem")

add_arrow_test(filesystem_test)
add_arrow_test(localfs_test)
add_arrow_test(path_tree_test)

if(ARROW_S3)
add_arrow_test(s3fs_test)
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/filesystem/filesystem.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ std::string FileStats::base_name() const {
return internal::GetAbstractPathParent(path_).second;
}

// Returns the first element of the pair produced by
// internal::GetAbstractPathParent for this entry's path (base_name() returns
// the second element of the same pair).
std::string FileStats::dir_name() const {
return internal::GetAbstractPathParent(path_).first;
}

// Debug helper
std::ostream& operator<<(std::ostream& os, const FileStats& stats) {
return os << "FileStats(" << stats.type() << ", " << stats.path() << ")";
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/filesystem/filesystem.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ struct ARROW_EXPORT FileStats {
/// The file base name (component after the last directory separator)
std::string base_name() const;

/// The directory name (the part of the path preceding the file base name),
/// as computed by internal::GetAbstractPathParent.
std::string dir_name() const;

/// The size in bytes, if available
///
/// Only regular files are guaranteed to have a size.
Expand All @@ -110,6 +113,9 @@ struct ARROW_EXPORT FileStats {
TimePoint mtime() const { return mtime_; }
void set_mtime(TimePoint mtime) { mtime_ = mtime; }

/// True if the type of this entry is FileType::File.
bool IsFile() const { return type_ == FileType::File; }
/// True if the type of this entry is FileType::Directory.
bool IsDirectory() const { return type_ == FileType::Directory; }

bool operator==(const FileStats& other) const {
return type() == other.type() && path() == other.path() && size() == other.size() &&
mtime() == other.mtime();
Expand Down
137 changes: 137 additions & 0 deletions cpp/src/arrow/filesystem/path_tree.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//

#include "arrow/filesystem/path_tree.h"

#include <algorithm>
#include <iostream>
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "arrow/filesystem/path_util.h"

namespace arrow {
namespace fs {

// Lookup table from a directory's path to the tree node created for that
// directory; used while building a forest to locate the parent of an entry.
using PathTreeByPathMap = std::unordered_map<std::string, std::shared_ptr<PathTree>>;

/// \brief Find the nearest registered directory node above `path`.
///
/// Repeatedly replaces `path` with the first element of
/// internal::GetAbstractPathParent(path) and returns the node of the first
/// such parent that is a key of `directories`. The search stops once the path
/// becomes empty. `path` itself is never looked up, only its parents.
///
/// This helper is an implementation detail of PathTree::Make, hence the
/// internal linkage.
///
/// \param[in] directories map from directory path to the node created for it
/// \param[in] path path whose ancestors are searched; taken by value because
///   it is consumed while walking up
/// \return the node of the closest registered ancestor, or nullptr if there is
///   none
static std::shared_ptr<PathTree> FindAncestor(const PathTreeByPathMap& directories,
                                              std::string path) {
  while (!path.empty()) {
    auto parent = internal::GetAbstractPathParent(path).first;
    const auto found = directories.find(parent);
    if (found != directories.end()) {
      return found->second;
    }

    // Not registered: continue the search one level up.
    path = std::move(parent);
  }

  return nullptr;
}

/// \brief Build a forest out of a flat list of FileStats.
///
/// Every entry with a non-empty path becomes a node. A node is attached to the
/// closest ancestor directory present in `stats`; when no such ancestor
/// exists, the node becomes a root of the forest. Entries with an empty path
/// are ignored. Only entries of type FileType::Directory can act as parents.
///
/// \param[in] stats entries to organize
/// \param[out] out receives the roots of the resulting forest
/// \return Status::OK()
Status PathTree::Make(std::vector<FileStats> stats, PathForest* out) {
  // Process entries from the shortest to the longest path so that a directory
  // is always registered before any of its descendants is looked at. The sort
  // is stable, so entries of equal path length keep their input order. Special
  // components such as '..' are not resolved: paths are expected to be
  // absolute.
  std::stable_sort(stats.begin(), stats.end(),
                   [](const FileStats& a, const FileStats& b) {
                     return a.path().size() < b.path().size();
                   });

  PathTreeByPathMap dir_nodes;
  PathForest roots;

  for (const FileStats& entry : stats) {
    if (entry.path().empty()) {
      continue;
    }

    auto parent_node = FindAncestor(dir_nodes, entry.path());
    auto current = std::make_shared<PathTree>(entry);
    if (!parent_node) {
      // No known ancestor: this node starts a new tree.
      roots.push_back(current);
    } else {
      parent_node->AddChild(current);
    }

    // Remember directories so that later (longer) paths can attach to them.
    if (entry.type() == FileType::Directory) {
      dir_nodes[entry.path()] = current;
    }
  }

  *out = std::move(roots);
  return Status::OK();
}

/// \brief Build a single tree out of a flat list of FileStats.
///
/// Delegates to the PathForest overload and checks the number of roots.
///
/// \param[in] stats entries to organize
/// \param[out] out receives the unique root. It is left untouched when the
///   resulting forest is empty (the call still succeeds in that case).
/// \return Status::Invalid if the entries form more than one tree, any error
///   reported by the PathForest overload, Status::OK() otherwise
Status PathTree::Make(std::vector<FileStats> stats, std::shared_ptr<PathTree>* out) {
  PathForest forest;
  // `stats` is already our own copy: hand it over instead of copying the
  // whole vector a second time.
  RETURN_NOT_OK(Make(std::move(stats), &forest));

  auto size = forest.size();
  if (size > 1) {
    return Status::Invalid("Requested PathTree has ", size, " roots, but expected 1.");
  }

  if (size == 1) {
    // The local forest is discarded right after, so steal its only root.
    *out = std::move(forest.front());
  }

  return Status::OK();
}

/// \brief Debug printer for a tree.
///
/// Writes "PathTree(" followed by the node's FileStats, then, only when the
/// node has subtrees, ", [" + the comma-separated subtrees + "]", and finally
/// ")". Subtrees are printed recursively with this same operator.
std::ostream& operator<<(std::ostream& os, const PathTree& tree) {
  os << "PathTree(" << tree.stats();

  const auto& children = tree.subtrees();
  if (!children.empty()) {
    os << ", [";
    bool first = true;
    for (const auto& child : children) {
      if (!first) {
        os << ", ";
      }
      first = false;
      os << *child;
    }
    os << "]";
  }

  return os << ")";
}

std::ostream& operator<<(std::ostream& os, const std::shared_ptr<PathTree>& tree) {
if (tree != nullptr) {
return os << *tree.get();
}

return os;
}

/// \brief Compare two trees held by shared_ptr by value rather than by address.
///
/// Two null pointers are equal, a null pointer never equals a non-null one,
/// and two non-null pointers are equal when the pointed-to trees compare
/// equal with PathTree::operator==.
bool operator==(const std::shared_ptr<PathTree>& lhs,
                const std::shared_ptr<PathTree>& rhs) {
  const bool lhs_null = !lhs;
  const bool rhs_null = !rhs;

  // At least one side is null: equal only if both are.
  if (lhs_null || rhs_null) {
    return lhs_null && rhs_null;
  }

  return *lhs == *rhs;
}

} // namespace fs
} // namespace arrow
110 changes: 110 additions & 0 deletions cpp/src/arrow/filesystem/path_tree.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/filesystem/filesystem.h"

#include <algorithm>
#include <iosfwd>
#include <memory>
#include <utility>
#include <vector>

#include "arrow/status.h"

namespace arrow {
namespace fs {

// Forward declaration, required by the PathForest alias below.
class ARROW_EXPORT PathTree;

/// \brief A PathForest consists of multiple PathTree instances.
using PathForest = std::vector<std::shared_ptr<PathTree>>;

/// \brief A PathTree is a utility to transform a vector of FileStats into a
/// forest representation for tree traversal purposes. Each node of the tree
/// wraps a FileStats. Files are expected to be found only at leaves of the
/// tree.
class ARROW_EXPORT PathTree {
 public:
  /// \brief Construct a node without subtrees wrapping `stats`.
  explicit PathTree(FileStats stats) : stats_(std::move(stats)) {}

  /// \brief Construct a node wrapping `stats` with the given subtrees.
  PathTree(FileStats stats, std::vector<std::shared_ptr<PathTree>> subtrees)
      : stats_(std::move(stats)), subtrees_(std::move(subtrees)) {}

  /// \brief Transforms a FileStats vector into a forest of trees. Since there
  /// is no guarantee of complete trees, it is possible to have a forest
  /// (multiple roots). The caller should ensure that stats have unique paths.
  ///
  /// \param[in] stats entries to organize
  /// \param[out] out receives the roots of the forest
  static Status Make(std::vector<FileStats> stats, PathForest* out);

  /// \brief Like the PathForest overload of Make, but fails if there is more
  /// than one root.
  ///
  /// \param[in] stats entries to organize
  /// \param[out] out receives the unique root; it is left untouched when the
  ///   resulting forest is empty
  static Status Make(std::vector<FileStats> stats, std::shared_ptr<PathTree>* out);

  /// \brief Returns a copy of the FileStats of this node.
  FileStats stats() const { return stats_; }

  /// \brief Returns a copy of the list of subtrees under this node.
  std::vector<std::shared_ptr<PathTree>> subtrees() const { return subtrees_; }

  /// \brief Visit with eager pruning.
  ///
  /// The matcher is called first as `m(stats, &match)`. When it leaves `match`
  /// false, neither this node nor any of its subtrees is visited. Otherwise
  /// the visitor is called as `v(stats)` and the subtrees are then visited in
  /// order with the same visitor and matcher. The first non-OK Status returned
  /// by the matcher or the visitor stops the traversal and is returned.
  ///
  /// \param[in] v callable invoked with the FileStats of each matched node,
  ///   returning a Status
  /// \param[in] m callable invoked with the FileStats of a node and a bool
  ///   output pointer, returning a Status
  template <typename Visitor, typename Matcher>
  Status Visit(Visitor&& v, Matcher&& m) const {
    bool match = false;
    ARROW_RETURN_NOT_OK(m(stats_, &match));
    if (!match) {
      // Pruned: skip this node and everything below it.
      return Status::OK();
    }

    ARROW_RETURN_NOT_OK(v(stats_));

    for (const auto& t : subtrees_) {
      ARROW_RETURN_NOT_OK(t->Visit(v, m));
    }

    return Status::OK();
  }

  /// \brief Visit this node and all of its subtrees, without pruning.
  ///
  /// \param[in] v callable invoked with the FileStats of each node, returning
  ///   a Status
  template <typename Visitor>
  Status Visit(Visitor&& v) const {
    // The FileStats argument is deliberately left unnamed: every node matches.
    auto always_match = [](const FileStats&, bool* match) {
      *match = true;
      return Status::OK();
    };
    return Visit(v, always_match);
  }

  /// \brief Two nodes are equal when their FileStats and their lists of
  /// subtrees compare equal.
  bool operator==(const PathTree& other) const {
    return stats_ == other.stats_ && subtrees_ == other.subtrees_;
  }

 protected:
  /// The entry wrapped by this node.
  FileStats stats_;
  /// The nodes directly below this node.
  std::vector<std::shared_ptr<PathTree>> subtrees_;

  // The AddChild method is convenient to create trees in a top-down fashion,
  // e.g. the Make factory constructor.
  void AddChild(std::shared_ptr<PathTree> child) {
    subtrees_.push_back(std::move(child));
  }
};

/// \brief Debug helper printing the tree held by `tree`; prints nothing when
/// the pointer is null.
ARROW_EXPORT std::ostream& operator<<(std::ostream& os,
const std::shared_ptr<PathTree>& tree);
/// \brief Debug helper printing a node as "PathTree(<stats>)", with the list
/// of its subtrees appended between brackets when it has any.
ARROW_EXPORT std::ostream& operator<<(std::ostream& os, const PathTree& tree);

/// \brief Compare the pointed-to trees instead of the pointer values. Two null
/// pointers are equal; a null pointer is never equal to a non-null one.
ARROW_EXPORT bool operator==(const std::shared_ptr<PathTree>& lhs,
const std::shared_ptr<PathTree>& rhs);

} // namespace fs
} // namespace arrow
Loading

0 comments on commit dec0cfb

Please sign in to comment.