From e9a96f7bcdb86b97fd3ccb980fbf5156ef9ee0c7 Mon Sep 17 00:00:00 2001 From: Arthur Peters Date: Wed, 21 Apr 2021 16:07:25 -0500 Subject: [PATCH] Clean up libgalois algorithms and add TODOs for notable issues. --- .../local_clustering_coefficient.h | 2 + .../louvain_clustering/louvain_clustering.h | 13 +++- .../analytics/random_walks/random_walks.h | 66 ++++++++++++++----- .../subgraph_extraction/subgraph_extraction.h | 13 +++- .../analytics/random_walks/random_walks.cpp | 47 ++++++------- 5 files changed, 98 insertions(+), 43 deletions(-) diff --git a/libgalois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h b/libgalois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h index 00b171aaf3..be719b2ba5 100644 --- a/libgalois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h +++ b/libgalois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h @@ -45,6 +45,7 @@ class LocalClusteringCoefficientPlan : public Plan { kDefaultRelabeling} {} Algorithm algorithm() const { return algorithm_; } + // TODO(amp): These parameters should be documented. Relabeling relabeling() const { return relabeling_; } bool edges_sorted() const { return edges_sorted_; } @@ -70,6 +71,7 @@ class LocalClusteringCoefficientPlan : public Plan { } }; +// TODO(amp): The doc string was not updated. /** * Count the total number of triangles in the graph. The graph must be * symmetric! diff --git a/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h b/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h index 6d896e1801..cca8cf0d64 100644 --- a/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h +++ b/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h @@ -13,7 +13,6 @@ namespace katana::analytics { /// parameters associated with it. class LouvainClusteringPlan : public Plan { public: - /// Algorithm selectors for Single-Source Shortest Path enum Algorithm { kDoAll, }; @@ -53,9 +52,17 @@ class LouvainClusteringPlan : public Plan { public: LouvainClusteringPlan() - : LouvainClusteringPlan{kCPU, kDoAll, false, 0.01, 0.01, 10, 100} {} + : LouvainClusteringPlan{ + kCPU, + kDoAll, + kEnableVF, + kModularityThresholdPerRound, + kModularityThresholdTotal, + kMaxIterations, + kMinGraphSize} {} Algorithm algorithm() const { return algorithm_; } + // TODO(amp): These parameters should be documented. bool is_enable_vf() const { return enable_vf_; } double modularity_threshold_per_round() const { return modularity_threshold_per_round_; @@ -86,7 +93,7 @@ class LouvainClusteringPlan : public Plan { /// Compute the Louvain Clustering for pg. /// The edge weights are taken from the property named /// edge_weight_property_name (which may be a 32- or 64-bit sign or unsigned -/// int), and the computed cluster ids are stored in the property named +/// int), and the computed cluster IDs are stored in the property named /// output_property_name (as uint32_t). /// The property named output_property_name is created by this function and may /// not exist before the call. diff --git a/libgalois/include/katana/analytics/random_walks/random_walks.h b/libgalois/include/katana/analytics/random_walks/random_walks.h index be2a3e03fc..36f9df34fd 100644 --- a/libgalois/include/katana/analytics/random_walks/random_walks.h +++ b/libgalois/include/katana/analytics/random_walks/random_walks.h @@ -14,13 +14,19 @@ namespace katana::analytics { /// A computational plan to for random walks, specifying the algorithm and any /// parameters associated with it. -class RandomWalksPlan : Plan { - enum Algo { node2vec, edge2vec }; - +class RandomWalksPlan : public Plan { public: /// Algorithm selectors for Connected-components enum Algorithm { kNode2Vec, kEdge2Vec }; + static const Algorithm kDefaultAlgorithm = kNode2Vec; + static const uint32_t kDefaultWalkLength = 1; + static const uint32_t kDefaultNumberOfWalks = 1; + constexpr static const double kDefaultBackwardProbability = 1.0; + constexpr static const double kDefaultForwardProbability = 1.0; + static const uint32_t kDefaultMaxIterations = 10; + static const uint32_t kDefaultNumberOfEdgeTypes = 1; + // Don't allow people to directly construct these, so as to have only one // consistent way to configure. private: @@ -50,23 +56,51 @@ class RandomWalksPlan : Plan { number_of_edge_types_(number_of_edge_types) {} public: - // kChunkSize is a fixed const int (default value: 1) + // kChunkSize is fixed at 1 static const int kChunkSize; - RandomWalksPlan() : RandomWalksPlan{kCPU, kNode2Vec, 1, 1, 1.0, 1.0, 10, 1} {} + RandomWalksPlan() + : RandomWalksPlan{ + kCPU, + kDefaultAlgorithm, + kDefaultWalkLength, + kDefaultNumberOfWalks, + kDefaultBackwardProbability, + kDefaultForwardProbability, + kDefaultMaxIterations, + kDefaultNumberOfEdgeTypes} {} Algorithm algorithm() const { return algorithm_; } + + // TODO(amp): The parameters walk_length, number_of_walks, + // backward_probability, and forward_probability control the expected output, + // not the algorithm used to compute the output. So they need to be parameters + // on the algorithm function, not in the plan. The plan should be parameters + // which do not change the expected output (though they may cause selecting a + // different correct output). + + /// Length of random walks. uint32_t walk_length() const { return walk_length_; } + + /// Number of walks per node. uint32_t number_of_walks() const { return number_of_walks_; } + + /// Probability of moving back to parent. double backward_probability() const { return backward_probability_; } + + /// Probability of moving forward (2-hops). double forward_probability() const { return forward_probability_; } + uint32_t max_iterations() const { return max_iterations_; } + uint32_t number_of_edge_types() const { return number_of_edge_types_; } /// Node2Vec algorithm to generate random walks on the graph static RandomWalksPlan Node2Vec( - uint32_t walk_length, uint32_t number_of_walks, - double backward_probability, double forward_probability) { + uint32_t walk_length = kDefaultWalkLength, + uint32_t number_of_walks = kDefaultNumberOfWalks, + double backward_probability = kDefaultBackwardProbability, + double forward_probability = kDefaultBackwardProbability) { return { kCPU, kNode2Vec, @@ -81,9 +115,12 @@ class RandomWalksPlan : Plan { /// Edge2Vec algorithm to generate random walks on the graph. /// Takes the heterogeneity of the edges into account static RandomWalksPlan Edge2Vec( - uint32_t walk_length, uint32_t number_of_walks, - double backward_probability, double forward_probability, - uint32_t max_iterations, uint32_t number_of_edge_types) { + uint32_t walk_length = kDefaultWalkLength, + uint32_t number_of_walks = kDefaultNumberOfWalks, + double backward_probability = kDefaultBackwardProbability, + double forward_probability = kDefaultBackwardProbability, + uint32_t max_iterations = kDefaultMaxIterations, + uint32_t number_of_edge_types = kDefaultNumberOfEdgeTypes) { return { kCPU, kNode2Vec, @@ -96,11 +133,10 @@ class RandomWalksPlan : Plan { } }; -/// Compute the random-walks for pg. The pg is expected to be -/// symmetric. -/// The parameters can be specified, but have reasonable defaults. Not all parameters -/// are used by the algorithms. -/// The generated random-walks generated are return in Katana::InsertBag. +/// Compute the random-walks for pg. The pg is expected to be symmetric. The +/// parameters can be specified, but have reasonable defaults. Not all +/// parameters are used by the algorithms. The generated random-walks generated +/// are returned as a vector of vectors. KATANA_EXPORT Result>> RandomWalks( PropertyGraph* pg, RandomWalksPlan plan = RandomWalksPlan()); diff --git a/libgalois/include/katana/analytics/subgraph_extraction/subgraph_extraction.h b/libgalois/include/katana/analytics/subgraph_extraction/subgraph_extraction.h index dad961b617..670a6598b3 100644 --- a/libgalois/include/katana/analytics/subgraph_extraction/subgraph_extraction.h +++ b/libgalois/include/katana/analytics/subgraph_extraction/subgraph_extraction.h @@ -26,6 +26,14 @@ class SubGraphExtractionPlan : public Plan { Algorithm algorithm() const { return algorithm_; } + // TODO(amp): This algorithm defines the semantics of the call. If there were + // an algorithm that, for instance, took a list of edges, that would need to + // be a different function, not just a different plan, since it takes + // semantically different arguments. I do think this should have a plan, even + // if there is only one concrete algorithm, but it should be defined and + // documented in terms of the concrete algorithm, not the semantics of the + // function (which is described well below). + /** * The node-set algorithm: * Given a set of node ids, this algorithm constructs a new sub-graph @@ -41,12 +49,13 @@ class SubGraphExtractionPlan : public Plan { * The new sub-graph is independent of the original graph. * * @param pg The graph to process. - * @param node_vec Set of node ids + * @param node_vec Set of node IDs * @param plan */ KATANA_EXPORT katana::Result> SubGraphExtraction( - katana::PropertyGraph* pg, const std::vector& node_vec, + katana::PropertyGraph* pg, + const std::vector& node_vec, SubGraphExtractionPlan plan = {}); // const std::vector& node_properties_to_copy, const std::vector& edge_properties_to_copy); diff --git a/libgalois/src/analytics/random_walks/random_walks.cpp b/libgalois/src/analytics/random_walks/random_walks.cpp index 3e7e9d71c4..46f481cccc 100644 --- a/libgalois/src/analytics/random_walks/random_walks.cpp +++ b/libgalois/src/analytics/random_walks/random_walks.cpp @@ -44,9 +44,8 @@ struct Node2VecAlgo { return graph.num_nodes(); } double total_wt = degree[n]; - prob = prob * total_wt; - uint32_t edge_index = std::floor(prob); + uint32_t edge_index = std::floor(prob * total_wt); auto edge = graph.edge_begin(n) + edge_index; return *graph.GetEdgeDest(edge); } @@ -79,7 +78,7 @@ struct Node2VecAlgo { uint64_t total_walks = graph.size() * plan_.number_of_walks(); katana::do_all( - katana::iterate((uint64_t)0, total_walks), + katana::iterate(uint64_t(0), total_walks), [&](uint64_t idx) { GNode n = idx % graph.size(); @@ -144,7 +143,7 @@ struct Node2VecAlgo { alpha = prob_forward; } - if (alpha >= y) { + if (y <= alpha) { //accept y walk.push_back(std::move(nbr)); break; @@ -153,7 +152,7 @@ struct Node2VecAlgo { } } - (*walks).push(std::move(walk)); + walks->push(std::move(walk)); }, katana::steal(), katana::chunk_size(), katana::loopname("Node2vec walks"), katana::no_stats()); @@ -234,7 +233,7 @@ struct Edge2VecAlgo { uint64_t total_walks = graph.size() * plan_.number_of_walks(); katana::do_all( - katana::iterate((uint64_t)0, total_walks), + katana::iterate(uint64_t(0), total_walks), [&](uint64_t idx) { GNode n = idx % graph.size(); @@ -338,8 +337,8 @@ struct Edge2VecAlgo { num_edge_types[type]++; } - (*per_thread_num_edge_types_walks.getLocal()) - .emplace_back(std::move(num_edge_types)); + per_thread_num_edge_types_walks.getLocal()->emplace_back( + std::move(num_edge_types)); }); for (unsigned j = 0; j < katana::getActiveThreads(); ++j) { @@ -359,7 +358,7 @@ struct Edge2VecAlgo { transformed_num_edge_types_walks.resize(plan_.number_of_edge_types() + 1); katana::do_all( - katana::iterate((uint32_t)0, plan_.number_of_edge_types() + 1), + katana::iterate(uint32_t(0), plan_.number_of_edge_types() + 1), [&](uint32_t j) { for (uint32_t i = 0; i < rows; i++) { transformed_num_edge_types_walks[j].push_back( @@ -395,8 +394,8 @@ struct Edge2VecAlgo { const std::vector>& transformed_num_edge_types_walks, const std::vector& means) { - std::vector x = transformed_num_edge_types_walks[i]; - std::vector y = transformed_num_edge_types_walks[j]; + const std::vector& x = transformed_num_edge_types_walks[i]; + const std::vector& y = transformed_num_edge_types_walks[j]; double sum = 0.0; double sig1 = 0.0; @@ -425,7 +424,7 @@ struct Edge2VecAlgo { transformed_num_edge_types_walks, const std::vector& means) { katana::do_all( - katana::iterate((uint32_t)1, plan_.number_of_edge_types() + 1), + katana::iterate(uint32_t(1), plan_.number_of_edge_types() + 1), [&](uint32_t i) { for (uint32_t j = 1; j <= plan_.number_of_edge_types(); j++) { double pearson_corr = @@ -468,12 +467,11 @@ struct Edge2VecAlgo { template void InitializeDegrees(const Graph& graph, katana::LargeArray* degree) { - katana::do_all( - katana::iterate(graph), - [&](typename Graph::Node n) { - (*degree)[n] = std::distance(graph.edge_begin(n), graph.edge_end(n)); - }, - katana::steal()); + katana::do_all(katana::iterate(graph), [&](typename Graph::Node n) { + // Treat this as O(1) time because subtracting iterators is just pointer + // or number subtraction. So don't use steal(). + (*degree)[n] = graph.edges(n).size(); + }); } } //namespace @@ -485,6 +483,12 @@ RandomWalksWithWrap(katana::PropertyGraph* pg, RandomWalksPlan plan) { return res.error(); } + // TODO(amp): This is incorrect. For Node2vec this needs to be: + // Algorithm::Graph::Make(pg, {}, {}) // Ignoring all properties. + // For Edge2vec this needs to be: + // Algorithm::Graph::Make(pg, {}, {edge_type_property_name}) + // The current version requires the input to have exactly the properties + // expected by the algorithm implementation. auto pg_result = Algorithm::Graph::Make(pg); if (!pg_result) { return pg_result.error(); @@ -510,11 +514,8 @@ RandomWalksWithWrap(katana::PropertyGraph* pg, RandomWalksPlan plan) { degree.deallocate(); std::vector> walks_in_vector; - walks_in_vector.reserve(pg->num_nodes() * plan.number_of_walks()); - // Copy to vector of vectors. - for (auto walk : walks) { - walks_in_vector.push_back(walk); - } + walks_in_vector.reserve(plan.number_of_walks()); + std::move(walks.begin(), walks.end(), std::back_inserter(walks_in_vector)); return walks_in_vector; }