Skip to content

Commit

Permalink
ARROW-6606: [C++] Add PathTree tree structure
Browse files Browse the repository at this point in the history
Construct tree structure from std::vector<fs::FileStats>, following the
path directory hierarchy.
  • Loading branch information
fsaintjacques committed Sep 26, 2019
1 parent 5b4a08f commit 109ea85
Show file tree
Hide file tree
Showing 9 changed files with 420 additions and 14 deletions.
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ set(ARROW_SRCS
filesystem/filesystem.cc
filesystem/localfs.cc
filesystem/mockfs.cc
filesystem/path_tree.cc
filesystem/path_util.cc
filesystem/util_internal.cc
io/buffered.cc
Expand Down
1 change: 1 addition & 0 deletions cpp/src/arrow/filesystem/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ arrow_install_all_headers("arrow/filesystem")

add_arrow_test(filesystem_test)
add_arrow_test(localfs_test)
add_arrow_test(path_tree_test)

if(ARROW_S3)
add_arrow_test(s3fs_test)
Expand Down
4 changes: 4 additions & 0 deletions cpp/src/arrow/filesystem/filesystem.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ std::string FileStats::base_name() const {
return internal::GetAbstractPathParent(path_).second;
}

// Returns the parent part of path_: the first element of the pair produced by
// internal::GetAbstractPathParent (base_name() returns the second element).
std::string FileStats::dir_name() const {
return internal::GetAbstractPathParent(path_).first;
}

// Debug helper
std::ostream& operator<<(std::ostream& os, const FileStats& stats) {
return os << "FileStats(" << stats.type() << ", " << stats.path() << ")";
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/filesystem/filesystem.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ struct ARROW_EXPORT FileStats {
/// The file base name (component after the last directory separator)
std::string base_name() const;

/// The directory name (the parent part of the path, before the file base name)
std::string dir_name() const;

/// The size in bytes, if available
///
/// Only regular files are guaranteed to have a size.
Expand All @@ -110,6 +113,9 @@ struct ARROW_EXPORT FileStats {
TimePoint mtime() const { return mtime_; }
void set_mtime(TimePoint mtime) { mtime_ = mtime; }

/// Whether the type of this entry is FileType::File
bool IsFile() const { return type_ == FileType::File; }
/// Whether the type of this entry is FileType::Directory
bool IsDirectory() const { return type_ == FileType::Directory; }

bool operator==(const FileStats& other) const {
return type() == other.type() && path() == other.path() && size() == other.size() &&
mtime() == other.mtime();
Expand Down
139 changes: 139 additions & 0 deletions cpp/src/arrow/filesystem/path_tree.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
//

#include "arrow/filesystem/path_tree.h"

#include <algorithm>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>

#include "arrow/filesystem/path_util.h"

namespace arrow {
namespace fs {

// Depth of `path`, measured as the number of separator characters
// (internal::kSep) it contains. A path without any separator has depth 0.
static int PathDepth(const std::string& path) {
  const auto separator_count = std::count(path.cbegin(), path.cend(), internal::kSep);
  // std::count yields the iterator difference type; the narrowing to int is
  // spelled out here instead of being left implicit.
  return static_cast<int>(separator_count);
}

// Index from a directory path to the tree node created for it. It is filled by
// LinkToParentOrInsertNewRoot and searched by FindAncestor.
using PathTreeByPathMap = std::unordered_map<std::string, std::shared_ptr<PathTree>>;

// Tells whether the FileStats wrapped by `tree` has the type `Type`.
// `tree` must not be null: it is dereferenced unconditionally.
template <FileType Type>
bool IsType(const std::shared_ptr<PathTree>& tree) {
  const auto node_type = tree->stats().type();
  return node_type == Type;
}

// Finds the closest registered ancestor of `path`.
//
// The parent of `path` (as computed by internal::GetAbstractPathParent) is
// looked up in `directories`; on a miss the search continues with the parent of
// the parent, and so on until the path becomes empty. Returns the node of the
// first hit, or nullptr when no ancestor is registered.
std::shared_ptr<PathTree> FindAncestor(PathTreeByPathMap* directories, std::string path) {
  while (!path.empty()) {
    auto parent_path = internal::GetAbstractPathParent(path).first;

    const auto entry = directories->find(parent_path);
    const bool is_registered = entry != directories->end();
    if (is_registered) {
      return entry->second;
    }

    // Not a known directory: climb one level and try again.
    path = std::move(parent_path);
  }

  return nullptr;
}

// Wraps `stats` in a new PathTree node and places that node in the structure
// being built.
//
// - A node whose path is empty becomes a root of `forest` and nothing else
//   happens; in particular it is not registered in `directories`.
// - Otherwise the node is attached to the closest ancestor found in
//   `directories`, or appended to `forest` as a new root when there is none.
// - A node of type FileType::Directory is then registered in `directories`
//   under its path so that later nodes can be attached to it.
static void LinkToParentOrInsertNewRoot(FileStats stats, PathTreeByPathMap* directories,
                                        PathTreeForest* forest) {
  const auto tree = std::make_shared<PathTree>(stats);

  const bool has_empty_path = stats.path() == "";
  if (has_empty_path) {
    forest->push_back(tree);
    return;
  }

  if (auto parent = FindAncestor(directories, stats.path())) {
    parent->AddChild(tree);
  } else {
    forest->push_back(tree);
  }

  const bool is_directory = tree->stats().type() == FileType::Directory;
  if (is_directory) {
    directories->insert({stats.path(), tree});
  }
}

// Groups directory FileStats by depth; the key is the value returned by
// PathDepth for the directory's path.
using DirectoryByDepthMap = std::unordered_map<int, std::vector<FileStats>>;

std::vector<int> OrderedDepths(const DirectoryByDepthMap& directories_by_depth) {
std::vector<int> depths;
for (auto k_v : directories_by_depth) {
depths.push_back(k_v.first);
}

// In practice, this is going to be O(lg(n)lg(lg(n))), i.e. constant.
std::sort(depths.begin(), depths.end());
return depths;
}

// Builds a forest out of `stats`, following the directory hierarchy of paths.
//
// Directories are inserted first, by ascending depth, so that a directory is
// always registered before any of its descendants is linked. The remaining
// entries are inserted afterwards, in their original relative order. Each entry
// is attached to its closest registered ancestor directory, or becomes a new
// root of the forest when there is none.
//
// \param[in] stats entries to organize (taken by value; consumed)
// \param[out] out receives the resulting forest
// \return Status::OK()
Status PathTree::Make(std::vector<FileStats> stats, PathTreeForest* out) {
  PathTreeByPathMap directories;
  PathTreeForest forest;

  // Partition the stats vector into (directories, others), keeping the relative
  // order inside each group. stable_partition already returns the boundary, so
  // no separate partition_point search is needed.
  auto is_directory = [](const FileStats& s) { return s.IsDirectory(); };
  const auto mid = std::stable_partition(stats.begin(), stats.end(), is_directory);

  // First, group directories by path depth. The directory entries of `stats`
  // are not read again after this loop, hence they can be moved from.
  DirectoryByDepthMap directories_by_depth;
  for (auto it = stats.begin(); it != mid; ++it) {
    const int depth = PathDepth(it->path());
    directories_by_depth[depth].push_back(std::move(*it));
  }

  // Insert directories by ascending depth, ensuring that children directories
  // are always inserted after ancestors.
  for (int depth : OrderedDepths(directories_by_depth)) {
    // Bind by reference: copying the whole vector for each depth is wasteful.
    const std::vector<FileStats>& same_depth = directories_by_depth.at(depth);
    for (const FileStats& dir : same_depth) {
      LinkToParentOrInsertNewRoot(dir, &directories, &forest);
    }
  }

  // Second, ingest the other entries. All directories are registered at this
  // point, hence the lookup for ancestors is valid.
  for (auto it = mid; it != stats.end(); ++it) {
    LinkToParentOrInsertNewRoot(*it, &directories, &forest);
  }

  *out = std::move(forest);
  return Status::OK();
}

// Builds a single tree out of `stats`.
//
// Delegates to the forest overload and then checks the number of roots:
// - more than one root: returns Status::Invalid and leaves `*out` untouched;
// - exactly one root: stores it in `*out`;
// - no root (e.g. `stats` is empty): `*out` is left untouched, so callers should
//   initialize it before the call if they need to detect this case.
//
// \param[in] stats entries to organize (taken by value; consumed)
// \param[out] out receives the unique root, if any
Status PathTree::Make(std::vector<FileStats> stats, std::shared_ptr<PathTree>* out) {
  PathTreeForest forest;
  // `stats` is not used afterwards: hand it over instead of copying the vector.
  RETURN_NOT_OK(Make(std::move(stats), &forest));

  auto size = forest.size();
  if (size > 1) {
    return Status::Invalid("Requested PathTree has ", size, " roots, but expected 1.");
  } else if (size == 1) {
    *out = forest[0];
  }

  return Status::OK();
}

} // namespace fs
} // namespace arrow
88 changes: 88 additions & 0 deletions cpp/src/arrow/filesystem/path_tree.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include "arrow/filesystem/filesystem.h"

#include <algorithm>
#include <memory>
#include <utility>
#include <vector>

namespace arrow {
namespace fs {

class ARROW_EXPORT PathTree;

/// \brief A PathTreeForest is a collection of PathTree roots
using PathTreeForest = std::vector<std::shared_ptr<PathTree>>;

/// \brief A PathTree is a utility to transform a vector of FileStats into a
/// forest representation for tree traversal purposes. Each node in the graph
/// wraps a FileStats. Files are expected to be found only at leaves of the tree.
class ARROW_EXPORT PathTree {
 public:
  /// \brief Construct a node without children wrapping `stats`.
  explicit PathTree(FileStats stats) : stats_(std::move(stats)) {}

  /// \brief Construct a node wrapping `stats` with the given children.
  PathTree(FileStats stats, std::vector<std::shared_ptr<PathTree>> subtrees)
      : stats_(std::move(stats)), subtrees_(std::move(subtrees)) {}

  /// \brief Transforms a FileStats vector into a forest of trees. Since there
  /// is no guarantee of complete trees, it is possible to have a forest
  /// (multiple roots). The caller should ensure that stats have unique paths.
  static Status Make(std::vector<FileStats> stats, PathTreeForest* out);

  /// \brief Like the forest overload of Make, but fails if there is more than
  /// one root.
  static Status Make(std::vector<FileStats> stats, std::shared_ptr<PathTree>* out);

  /// \brief Returns a copy of the FileStats of this node.
  FileStats stats() const { return stats_; }
  /// \brief Returns a copy of the list of subtrees under this node.
  std::vector<std::shared_ptr<PathTree>> subtrees() const { return subtrees_; }

  /// \brief Visit this node, then each subtree recursively, in insertion order.
  ///
  /// `v` is invoked with the FileStats of every visited node. The visitor is
  /// passed by value to each recursive call, so a visitor that accumulates
  /// results should do so through a reference or a pointer it holds, not in
  /// its own members.
  template <typename Visitor>
  void Visit(Visitor v) const {
    v(stats_);

    for (const auto& subtree : subtrees_) {
      subtree->Visit(v);
    }
  }

  /// \brief Visit with eager pruning.
  ///
  /// `m` is evaluated on the FileStats of a node before the node is visited.
  /// When it returns false, neither the node nor any of its descendants is
  /// visited. As with the other overload, `v` and `m` are passed by value to
  /// each recursive call.
  template <typename Visitor, typename Matcher>
  void Visit(Visitor v, Matcher m) const {
    if (!m(stats_)) {
      return;
    }

    v(stats_);

    for (const auto& subtree : subtrees_) {
      subtree->Visit(v, m);
    }
  }

  /// \brief Append `child` at the end of the subtrees of this node.
  void AddChild(std::shared_ptr<PathTree> child) {
    subtrees_.push_back(std::move(child));
  }

 protected:
  /// The entry wrapped by this node.
  FileStats stats_;
  /// The children of this node, in insertion order.
  std::vector<std::shared_ptr<PathTree>> subtrees_;
};

} // namespace fs
} // namespace arrow
Loading

0 comments on commit 109ea85

Please sign in to comment.