From 66e8526b52462ad5a254a9419edeb89fc43a05b8 Mon Sep 17 00:00:00 2001 From: Arthur Peters Date: Wed, 21 Apr 2021 16:07:25 -0500 Subject: [PATCH 1/6] Clean up libgalois algorithms and add TODOs for notable issues. --- .../local_clustering_coefficient.h | 2 + .../louvain_clustering/louvain_clustering.h | 13 +++- .../analytics/random_walks/random_walks.h | 68 ++++++++++++++----- .../subgraph_extraction/subgraph_extraction.h | 13 +++- .../analytics/random_walks/random_walks.cpp | 47 ++++++------- 5 files changed, 99 insertions(+), 44 deletions(-) diff --git a/libgalois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h b/libgalois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h index 00b171aaf3..be719b2ba5 100644 --- a/libgalois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h +++ b/libgalois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h @@ -45,6 +45,7 @@ class LocalClusteringCoefficientPlan : public Plan { kDefaultRelabeling} {} Algorithm algorithm() const { return algorithm_; } + // TODO(amp): These parameters should be documented. Relabeling relabeling() const { return relabeling_; } bool edges_sorted() const { return edges_sorted_; } @@ -70,6 +71,7 @@ class LocalClusteringCoefficientPlan : public Plan { } }; +// TODO(amp): The doc string was not updated. /** * Count the total number of triangles in the graph. The graph must be * symmetric! diff --git a/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h b/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h index 6d896e1801..cca8cf0d64 100644 --- a/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h +++ b/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h @@ -13,7 +13,6 @@ namespace katana::analytics { /// parameters associated with it. class LouvainClusteringPlan : public Plan { public: - /// Algorithm selectors for Single-Source Shortest Path enum Algorithm { kDoAll, }; @@ -53,9 +52,17 @@ class LouvainClusteringPlan : public Plan { public: LouvainClusteringPlan() - : LouvainClusteringPlan{kCPU, kDoAll, false, 0.01, 0.01, 10, 100} {} + : LouvainClusteringPlan{ + kCPU, + kDoAll, + kEnableVF, + kModularityThresholdPerRound, + kModularityThresholdTotal, + kMaxIterations, + kMinGraphSize} {} Algorithm algorithm() const { return algorithm_; } + // TODO(amp): These parameters should be documented. bool is_enable_vf() const { return enable_vf_; } double modularity_threshold_per_round() const { return modularity_threshold_per_round_; @@ -86,7 +93,7 @@ class LouvainClusteringPlan : public Plan { /// Compute the Louvain Clustering for pg. /// The edge weights are taken from the property named /// edge_weight_property_name (which may be a 32- or 64-bit sign or unsigned -/// int), and the computed cluster ids are stored in the property named +/// int), and the computed cluster IDs are stored in the property named /// output_property_name (as uint32_t). /// The property named output_property_name is created by this function and may /// not exist before the call. diff --git a/libgalois/include/katana/analytics/random_walks/random_walks.h b/libgalois/include/katana/analytics/random_walks/random_walks.h index be2a3e03fc..de30b4c8e0 100644 --- a/libgalois/include/katana/analytics/random_walks/random_walks.h +++ b/libgalois/include/katana/analytics/random_walks/random_walks.h @@ -14,13 +14,19 @@ namespace katana::analytics { /// A computational plan to for random walks, specifying the algorithm and any /// parameters associated with it. -class RandomWalksPlan : Plan { - enum Algo { node2vec, edge2vec }; - +class RandomWalksPlan : public Plan { public: - /// Algorithm selectors for Connected-components + /// Algorithm selectors for random walks enum Algorithm { kNode2Vec, kEdge2Vec }; + static const Algorithm kDefaultAlgorithm = kNode2Vec; + static const uint32_t kDefaultWalkLength = 1; + static const uint32_t kDefaultNumberOfWalks = 1; + constexpr static const double kDefaultBackwardProbability = 1.0; + constexpr static const double kDefaultForwardProbability = 1.0; + static const uint32_t kDefaultMaxIterations = 10; + static const uint32_t kDefaultNumberOfEdgeTypes = 1; + // Don't allow people to directly construct these, so as to have only one // consistent way to configure. private: @@ -50,23 +56,51 @@ class RandomWalksPlan : Plan { number_of_edge_types_(number_of_edge_types) {} public: - // kChunkSize is a fixed const int (default value: 1) + // kChunkSize is fixed at 1 static const int kChunkSize; - RandomWalksPlan() : RandomWalksPlan{kCPU, kNode2Vec, 1, 1, 1.0, 1.0, 10, 1} {} + RandomWalksPlan() + : RandomWalksPlan{ + kCPU, + kDefaultAlgorithm, + kDefaultWalkLength, + kDefaultNumberOfWalks, + kDefaultBackwardProbability, + kDefaultForwardProbability, + kDefaultMaxIterations, + kDefaultNumberOfEdgeTypes} {} Algorithm algorithm() const { return algorithm_; } + + // TODO(amp): The parameters walk_length, number_of_walks, + // backward_probability, and forward_probability control the expected output, + // not the algorithm used to compute the output. So they need to be parameters + // on the algorithm function, not in the plan. The plan should be parameters + // which do not change the expected output (though they may cause selecting a + // different correct output). + + /// Length of random walks. uint32_t walk_length() const { return walk_length_; } + + /// Number of walks per node. uint32_t number_of_walks() const { return number_of_walks_; } + + /// Probability of moving back to parent. double backward_probability() const { return backward_probability_; } + + /// Probability of moving forward (2-hops). double forward_probability() const { return forward_probability_; } + uint32_t max_iterations() const { return max_iterations_; } + uint32_t number_of_edge_types() const { return number_of_edge_types_; } /// Node2Vec algorithm to generate random walks on the graph static RandomWalksPlan Node2Vec( - uint32_t walk_length, uint32_t number_of_walks, - double backward_probability, double forward_probability) { + uint32_t walk_length = kDefaultWalkLength, + uint32_t number_of_walks = kDefaultNumberOfWalks, + double backward_probability = kDefaultBackwardProbability, + double forward_probability = kDefaultBackwardProbability) { return { kCPU, kNode2Vec, @@ -81,9 +115,12 @@ class RandomWalksPlan : Plan { /// Edge2Vec algorithm to generate random walks on the graph. /// Takes the heterogeneity of the edges into account static RandomWalksPlan Edge2Vec( - uint32_t walk_length, uint32_t number_of_walks, - double backward_probability, double forward_probability, - uint32_t max_iterations, uint32_t number_of_edge_types) { + uint32_t walk_length = kDefaultWalkLength, + uint32_t number_of_walks = kDefaultNumberOfWalks, + double backward_probability = kDefaultBackwardProbability, + double forward_probability = kDefaultBackwardProbability, + uint32_t max_iterations = kDefaultMaxIterations, + uint32_t number_of_edge_types = kDefaultNumberOfEdgeTypes) { return { kCPU, kNode2Vec, @@ -96,11 +133,10 @@ class RandomWalksPlan : Plan { } }; -/// Compute the random-walks for pg. The pg is expected to be -/// symmetric. -/// The parameters can be specified, but have reasonable defaults. Not all parameters -/// are used by the algorithms. -/// The generated random-walks generated are return in Katana::InsertBag. +/// Compute the random-walks for pg. The pg is expected to be symmetric. The +/// parameters can be specified, but have reasonable defaults. Not all +/// parameters are used by the algorithms. The generated random-walks generated +/// are returned as a vector of vectors. KATANA_EXPORT Result>> RandomWalks( PropertyGraph* pg, RandomWalksPlan plan = RandomWalksPlan()); diff --git a/libgalois/include/katana/analytics/subgraph_extraction/subgraph_extraction.h b/libgalois/include/katana/analytics/subgraph_extraction/subgraph_extraction.h index dad961b617..670a6598b3 100644 --- a/libgalois/include/katana/analytics/subgraph_extraction/subgraph_extraction.h +++ b/libgalois/include/katana/analytics/subgraph_extraction/subgraph_extraction.h @@ -26,6 +26,14 @@ class SubGraphExtractionPlan : public Plan { Algorithm algorithm() const { return algorithm_; } + // TODO(amp): This algorithm defines the semantics of the call. If there were + // an algorithm that, for instance, took a list of edges, that would need to + // be a different function, not just a different plan, since it takes + // semantically different arguments. I do think this should have a plan, even + // if there is only one concrete algorithm, but it should be defined and + // documented in terms of the concrete algorithm, not the semantics of the + // function (which is described well below). + /** * The node-set algorithm: * Given a set of node ids, this algorithm constructs a new sub-graph @@ -41,12 +49,13 @@ class SubGraphExtractionPlan : public Plan { * The new sub-graph is independent of the original graph. * * @param pg The graph to process. - * @param node_vec Set of node ids + * @param node_vec Set of node IDs * @param plan */ KATANA_EXPORT katana::Result> SubGraphExtraction( - katana::PropertyGraph* pg, const std::vector& node_vec, + katana::PropertyGraph* pg, + const std::vector& node_vec, SubGraphExtractionPlan plan = {}); // const std::vector& node_properties_to_copy, const std::vector& edge_properties_to_copy); diff --git a/libgalois/src/analytics/random_walks/random_walks.cpp b/libgalois/src/analytics/random_walks/random_walks.cpp index 3e7e9d71c4..46f481cccc 100644 --- a/libgalois/src/analytics/random_walks/random_walks.cpp +++ b/libgalois/src/analytics/random_walks/random_walks.cpp @@ -44,9 +44,8 @@ struct Node2VecAlgo { return graph.num_nodes(); } double total_wt = degree[n]; - prob = prob * total_wt; - uint32_t edge_index = std::floor(prob); + uint32_t edge_index = std::floor(prob * total_wt); auto edge = graph.edge_begin(n) + edge_index; return *graph.GetEdgeDest(edge); } @@ -79,7 +78,7 @@ struct Node2VecAlgo { uint64_t total_walks = graph.size() * plan_.number_of_walks(); katana::do_all( - katana::iterate((uint64_t)0, total_walks), + katana::iterate(uint64_t(0), total_walks), [&](uint64_t idx) { GNode n = idx % graph.size(); @@ -144,7 +143,7 @@ struct Node2VecAlgo { alpha = prob_forward; } - if (alpha >= y) { + if (y <= alpha) { //accept y walk.push_back(std::move(nbr)); break; @@ -153,7 +152,7 @@ struct Node2VecAlgo { } } - (*walks).push(std::move(walk)); + walks->push(std::move(walk)); }, katana::steal(), katana::chunk_size(), katana::loopname("Node2vec walks"), katana::no_stats()); @@ -234,7 +233,7 @@ struct Edge2VecAlgo { uint64_t total_walks = graph.size() * plan_.number_of_walks(); katana::do_all( - katana::iterate((uint64_t)0, total_walks), + katana::iterate(uint64_t(0), total_walks), [&](uint64_t idx) { GNode n = idx % graph.size(); @@ -338,8 +337,8 @@ struct Edge2VecAlgo { num_edge_types[type]++; } - (*per_thread_num_edge_types_walks.getLocal()) - .emplace_back(std::move(num_edge_types)); + per_thread_num_edge_types_walks.getLocal()->emplace_back( + std::move(num_edge_types)); }); for (unsigned j = 0; j < katana::getActiveThreads(); ++j) { @@ -359,7 +358,7 @@ struct Edge2VecAlgo { transformed_num_edge_types_walks.resize(plan_.number_of_edge_types() + 1); katana::do_all( - katana::iterate((uint32_t)0, plan_.number_of_edge_types() + 1), + katana::iterate(uint32_t(0), plan_.number_of_edge_types() + 1), [&](uint32_t j) { for (uint32_t i = 0; i < rows; i++) { transformed_num_edge_types_walks[j].push_back( @@ -395,8 +394,8 @@ struct Edge2VecAlgo { const std::vector>& transformed_num_edge_types_walks, const std::vector& means) { - std::vector x = transformed_num_edge_types_walks[i]; - std::vector y = transformed_num_edge_types_walks[j]; + const std::vector& x = transformed_num_edge_types_walks[i]; + const std::vector& y = transformed_num_edge_types_walks[j]; double sum = 0.0; double sig1 = 0.0; @@ -425,7 +424,7 @@ struct Edge2VecAlgo { transformed_num_edge_types_walks, const std::vector& means) { katana::do_all( - katana::iterate((uint32_t)1, plan_.number_of_edge_types() + 1), + katana::iterate(uint32_t(1), plan_.number_of_edge_types() + 1), [&](uint32_t i) { for (uint32_t j = 1; j <= plan_.number_of_edge_types(); j++) { double pearson_corr = @@ -468,12 +467,11 @@ struct Edge2VecAlgo { template void InitializeDegrees(const Graph& graph, katana::LargeArray* degree) { - katana::do_all( - katana::iterate(graph), - [&](typename Graph::Node n) { - (*degree)[n] = std::distance(graph.edge_begin(n), graph.edge_end(n)); - }, - katana::steal()); + katana::do_all(katana::iterate(graph), [&](typename Graph::Node n) { + // Treat this as O(1) time because subtracting iterators is just pointer + // or number subtraction. So don't use steal(). + (*degree)[n] = graph.edges(n).size(); + }); } } //namespace @@ -485,6 +483,12 @@ RandomWalksWithWrap(katana::PropertyGraph* pg, RandomWalksPlan plan) { return res.error(); } + // TODO(amp): This is incorrect. For Node2vec this needs to be: + // Algorithm::Graph::Make(pg, {}, {}) // Ignoring all properties. + // For Edge2vec this needs to be: + // Algorithm::Graph::Make(pg, {}, {edge_type_property_name}) + // The current version requires the input to have exactly the properties + // expected by the algorithm implementation. auto pg_result = Algorithm::Graph::Make(pg); if (!pg_result) { return pg_result.error(); @@ -510,11 +514,8 @@ RandomWalksWithWrap(katana::PropertyGraph* pg, RandomWalksPlan plan) { degree.deallocate(); std::vector> walks_in_vector; - walks_in_vector.reserve(pg->num_nodes() * plan.number_of_walks()); - // Copy to vector of vectors. - for (auto walk : walks) { - walks_in_vector.push_back(walk); - } + walks_in_vector.reserve(plan.number_of_walks()); + std::move(walks.begin(), walks.end(), std::back_inserter(walks_in_vector)); return walks_in_vector; } From 06c024741ba858504f3a2af408e3d68f73c35318 Mon Sep 17 00:00:00 2001 From: Arthur Peters Date: Sat, 24 Apr 2021 18:15:16 -0500 Subject: [PATCH 2/6] Implement move semantics for TemporaryPropertyGuard. This enables the use of std collections of TemporaryPropertyGuards --- libgalois/include/katana/analytics/Utils.h | 58 ++++++++++++++++------ libgalois/src/analytics/Utils.cpp | 3 ++ 2 files changed, 46 insertions(+), 15 deletions(-) diff --git a/libgalois/include/katana/analytics/Utils.h b/libgalois/include/katana/analytics/Utils.h index 8626975bb6..a323d15833 100644 --- a/libgalois/include/katana/analytics/Utils.h +++ b/libgalois/include/katana/analytics/Utils.h @@ -3,6 +3,7 @@ #include #include +#include #include "katana/ErrorCode.h" #include "katana/Properties.h" @@ -72,38 +73,65 @@ ConstructEdgeProperties( } class TemporaryPropertyGuard { - katana::PropertyGraph* pfg_; + static thread_local int temporary_property_counter; + + katana::PropertyGraph* pg_{nullptr}; std::string name_; std::string GetPropertyName() { - // Use this as part of the property name since this will delete the property - // when it is deconstructed so this name should be unique at any given time. + // Use a thread local counter and the thread ID to get a unique name. + // `this` is not unique because we support moves. return fmt::format( - "__katana_temporary_property_{}", reinterpret_cast(this)); + "__katana_temporary_property_{}_{}", std::this_thread::get_id(), + temporary_property_counter++); } + void Deinit() { + if (!pg_) { + return; + } + + if (auto r = pg_->RemoveNodeProperty(name_); !r) { + if (r.error() != ErrorCode::PropertyNotFound) { + // Log an error if something goes wrong other than the property not + // existing. + KATANA_LOG_WARN("Failed to remove temporary property: {}", r.error()); + } + } + Clear(); + } + + void Clear() { pg_ = nullptr; } + public: + TemporaryPropertyGuard() = default; + TemporaryPropertyGuard(PropertyGraph* pg, std::string name) - : pfg_(pg), name_(name) {} + : pg_(pg), name_(std::move(name)) {} - explicit TemporaryPropertyGuard(katana::PropertyGraph* pg) + explicit TemporaryPropertyGuard(PropertyGraph* pg) : TemporaryPropertyGuard(pg, GetPropertyName()) {} const TemporaryPropertyGuard& operator=(const TemporaryPropertyGuard&) = delete; TemporaryPropertyGuard(const TemporaryPropertyGuard&) = delete; - std::string name() const { return name_; } + TemporaryPropertyGuard(TemporaryPropertyGuard&& rhs) noexcept + : pg_(rhs.pg_), name_(std::move(rhs.name_)) { + rhs.Clear(); + } - ~TemporaryPropertyGuard() { - if (auto r = pfg_->RemoveNodeProperty(name_); !r) { - if (r.error() != katana::ErrorCode::PropertyNotFound) { - // Log an error if something goes wrong other than the property not - // existing. - KATANA_LOG_WARN("Failed to remove temporary property: {}", r.error()); - } - } + TemporaryPropertyGuard& operator=(TemporaryPropertyGuard&& rhs) noexcept { + Deinit(); + pg_ = rhs.pg_; + name_ = std::move(rhs.name_); + rhs.Clear(); + return *this; } + + std::string name() const { return name_; } + + ~TemporaryPropertyGuard() { Deinit(); } }; } // namespace katana::analytics diff --git a/libgalois/src/analytics/Utils.cpp b/libgalois/src/analytics/Utils.cpp index edf73dfd7c..4eaebc0d81 100644 --- a/libgalois/src/analytics/Utils.cpp +++ b/libgalois/src/analytics/Utils.cpp @@ -63,3 +63,6 @@ katana::analytics::IsApproximateDegreeDistributionPowerLaw( autoAlgoTimer.stop(); return sample_average / 1.3 > sample_median; } + +thread_local int + katana::analytics::TemporaryPropertyGuard::temporary_property_counter = 0; From 80300a386d100b7fe85e55fbd05b58a52d13c9f2 Mon Sep 17 00:00:00 2001 From: Arthur Peters Date: Thu, 22 Apr 2021 18:04:09 -0500 Subject: [PATCH 3/6] libgalois: Clean up analytics algorithms. * Avoid leaking properties in the error case. * Create output in the correct graph * Improve docs --- .../katana/analytics/k_truss/k_truss.h | 2 + .../local_clustering_coefficient.h | 29 ++++++------ .../louvain_clustering/louvain_clustering.h | 44 +++++++++---------- .../local_clustering_coefficient.cpp | 22 +++++----- .../louvain_clustering/louvain_clustering.cpp | 31 ++++++------- .../local_clustering_coefficient_cli.cpp | 7 ++- 6 files changed, 66 insertions(+), 69 deletions(-) diff --git a/libgalois/include/katana/analytics/k_truss/k_truss.h b/libgalois/include/katana/analytics/k_truss/k_truss.h index 7cf11641b0..f21beb0124 100644 --- a/libgalois/include/katana/analytics/k_truss/k_truss.h +++ b/libgalois/include/katana/analytics/k_truss/k_truss.h @@ -47,6 +47,8 @@ class KTrussPlan : public Plan { /// but have reasonable defaults. /// The property named output_property_name is created by this function and may /// not exist before the call. +/// +/// @warning This algorithm will reorder nodes and edges in the graph. KATANA_EXPORT Result KTruss( PropertyGraph* pg, uint32_t k_truss_number, const std::string& output_property_name, KTrussPlan plan = KTrussPlan()); diff --git a/libgalois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h b/libgalois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h index be719b2ba5..76c6d92409 100644 --- a/libgalois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h +++ b/libgalois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h @@ -23,31 +23,31 @@ class LocalClusteringCoefficientPlan : public Plan { }; static const Relabeling kDefaultRelabeling = kAutoRelabel; - static const bool kDefaultEdgeSorted = false; + static const bool kDefaultEdgesSorted = false; private: Algorithm algorithm_; - Relabeling relabeling_; bool edges_sorted_; + Relabeling relabeling_; LocalClusteringCoefficientPlan( Architecture architecture, Algorithm algorithm, bool edges_sorted, Relabeling relabeling) : Plan(architecture), algorithm_(algorithm), - relabeling_(relabeling), - edges_sorted_(edges_sorted) {} + edges_sorted_(edges_sorted), + relabeling_(relabeling) {} public: LocalClusteringCoefficientPlan() : LocalClusteringCoefficientPlan{ - kCPU, kOrderedCountPerThread, kDefaultEdgeSorted, + kCPU, kOrderedCountPerThread, kDefaultEdgesSorted, kDefaultRelabeling} {} Algorithm algorithm() const { return algorithm_; } // TODO(amp): These parameters should be documented. - Relabeling relabeling() const { return relabeling_; } bool edges_sorted() const { return edges_sorted_; } + Relabeling relabeling() const { return relabeling_; } /** * An ordered count algorithm that sorts the nodes by degree before @@ -58,29 +58,28 @@ class LocalClusteringCoefficientPlan : public Plan { * @param edges_sorted Are the edges of the graph already sorted. * @param relabeling Should the algorithm relabel the nodes. */ - static LocalClusteringCoefficientPlan LocalClusteringCoefficientAtomics( - bool edges_sorted = kDefaultEdgeSorted, + static LocalClusteringCoefficientPlan OrderedCountAtomics( + bool edges_sorted = kDefaultEdgesSorted, Relabeling relabeling = kDefaultRelabeling) { return {kCPU, kOrderedCountAtomics, edges_sorted, relabeling}; } - static LocalClusteringCoefficientPlan LocalClusteringCoefficientPerThread( - bool edges_sorted = kDefaultEdgeSorted, + static LocalClusteringCoefficientPlan OrderedCountPerThread( + bool edges_sorted = kDefaultEdgesSorted, Relabeling relabeling = kDefaultRelabeling) { return {kCPU, kOrderedCountPerThread, edges_sorted, relabeling}; } }; -// TODO(amp): The doc string was not updated. /** - * Count the total number of triangles in the graph. The graph must be - * symmetric! - * - * This algorithm copies the graph internally. + * Compute the local clustering coefficient for each node in the graph. + * The graph must be symmetric! * * @param pg The graph to process. * @param output_property_name name of the output property * @param plan + * + * @warning This algorithm will reorder nodes and edges in the graph. */ KATANA_EXPORT Result LocalClusteringCoefficient( PropertyGraph* pg, const std::string& output_property_name, diff --git a/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h b/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h index cca8cf0d64..4254af348b 100644 --- a/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h +++ b/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h @@ -17,25 +17,20 @@ class LouvainClusteringPlan : public Plan { kDoAll, }; - static const bool kEnableVF = false; - static constexpr double kModularityThresholdPerRound = 0.01; - static constexpr double kModularityThresholdTotal = 0.01; - static const uint32_t kMaxIterations = 10; - static const uint32_t kMinGraphSize = 100; + static const bool kDefaultEnableVF = false; + static constexpr double kDefaultModularityThresholdPerRound = 0.01; + static constexpr double kDefaultModularityThresholdTotal = 0.01; + static const uint32_t kDefaultMaxIterations = 10; + static const uint32_t kDefaultMinGraphSize = 100; // Don't allow people to directly construct these, so as to have only one // consistent way to configure. private: Algorithm algorithm_; - //Flag to enable vertex following optimization. bool enable_vf_; - //Threshold for modularity gain per round. double modularity_threshold_per_round_; - //Threshold for overall modularity gain. double modularity_threshold_total_; - //Maximum number of iterations to execute. uint32_t max_iterations_; - //Minimum coarsened graph size uint32_t min_graph_size_; LouvainClusteringPlan( @@ -55,30 +50,35 @@ class LouvainClusteringPlan : public Plan { : LouvainClusteringPlan{ kCPU, kDoAll, - kEnableVF, - kModularityThresholdPerRound, - kModularityThresholdTotal, - kMaxIterations, - kMinGraphSize} {} + kDefaultEnableVF, + kDefaultModularityThresholdPerRound, + kDefaultModularityThresholdTotal, + kDefaultMaxIterations, + kDefaultMinGraphSize} {} Algorithm algorithm() const { return algorithm_; } - // TODO(amp): These parameters should be documented. - bool is_enable_vf() const { return enable_vf_; } + /// Enable vertex following optimization + bool enable_vf() const { return enable_vf_; } + /// Threshold for modularity gain per round. double modularity_threshold_per_round() const { return modularity_threshold_per_round_; } + /// Threshold for overall modularity gain. double modularity_threshold_total() const { return modularity_threshold_total_; } + /// Maximum number of iterations to execute. uint32_t max_iterations() const { return max_iterations_; } + /// Minimum coarsened graph size uint32_t min_graph_size() const { return min_graph_size_; } static LouvainClusteringPlan DoAll( - bool enable_vf = kEnableVF, - double modularity_threshold_per_round = kModularityThresholdPerRound, - double modularity_threshold_total = kModularityThresholdTotal, - uint32_t max_iterations = kMaxIterations, - uint32_t min_graph_size = kMinGraphSize) { + bool enable_vf = kDefaultEnableVF, + double modularity_threshold_per_round = + kDefaultModularityThresholdPerRound, + double modularity_threshold_total = kDefaultModularityThresholdTotal, + uint32_t max_iterations = kDefaultMaxIterations, + uint32_t min_graph_size = kDefaultMinGraphSize) { return { kCPU, kDoAll, diff --git a/libgalois/src/analytics/local_clustering_coefficient/local_clustering_coefficient.cpp b/libgalois/src/analytics/local_clustering_coefficient/local_clustering_coefficient.cpp index 9e3ee8173a..f5a4baadde 100644 --- a/libgalois/src/analytics/local_clustering_coefficient/local_clustering_coefficient.cpp +++ b/libgalois/src/analytics/local_clustering_coefficient/local_clustering_coefficient.cpp @@ -311,16 +311,18 @@ katana::analytics::LocalClusteringCoefficient( return katana::ErrorCode::AssertionFailed; } - std::unique_ptr mutable_pfg; - if (relabel || !plan.edges_sorted()) { - // Copy the graph so we don't mutate the users graph. - auto mutable_pfg_result = pg->Copy({}, {}); - if (!mutable_pfg_result) { - return mutable_pfg_result.error(); - } - mutable_pfg = std::move(mutable_pfg_result.value()); - pg = mutable_pfg.get(); - } + // std::unique_ptr mutable_pfg; + // if (relabel || !plan.edges_sorted()) { + // // Copy the graph so we don't mutate the users graph. + // auto mutable_pfg_result = pg->Copy({}, {}); + // if (!mutable_pfg_result) { + // return mutable_pfg_result.error(); + // } + // mutable_pfg = std::move(mutable_pfg_result.value()); + // pg = mutable_pfg.get(); + // } + + // TODO(amp): Don't mutate the users topology! if (relabel) { katana::StatTimer timer_relabel( diff --git a/libgalois/src/analytics/louvain_clustering/louvain_clustering.cpp b/libgalois/src/analytics/louvain_clustering/louvain_clustering.cpp index 2b043263eb..e3ec6d8b3c 100644 --- a/libgalois/src/analytics/louvain_clustering/louvain_clustering.cpp +++ b/libgalois/src/analytics/louvain_clustering/louvain_clustering.cpp @@ -229,7 +229,7 @@ struct LouvainClusteringImplementation /* * Vertex following optimization */ - if (plan.is_enable_vf()) { + if (plan.enable_vf()) { Base::VertexFollowing(&graph_curr); // Find nodes that follow other nodes uint64_t num_unique_clusters = @@ -314,7 +314,7 @@ struct LouvainClusteringImplementation if (iter < plan.max_iterations() && (curr_mod - prev_mod) > plan.modularity_threshold_total()) { - if (!plan.is_enable_vf() && phase == 1) { + if (!plan.enable_vf() && phase == 1) { KATANA_LOG_DEBUG_ASSERT(num_nodes_orig == graph_curr.num_nodes()); katana::do_all(katana::iterate(graph_curr), [&](GNode n) { clusters_orig[n] = @@ -361,12 +361,17 @@ LouvainClusteringWithWrap( std::is_integral_v || std::is_floating_point_v); - //! The property name with prefixed with "_katana_temporary_property" - //! are reserved for internal use only. - std::vector temp_node_property_names = { - "_katana_temporary_property_CurrentId", - "_katana_temporary_property_PreviousId", - "_katana_temporary_property_DegreeWt"}; + std::vector temp_node_properties(3); + std::generate_n( + temp_node_properties.begin(), temp_node_properties.size(), + [&]() { return TemporaryPropertyGuard{pfg}; }); + std::vector temp_node_property_names( + temp_node_properties.size()); + std::transform( + temp_node_properties.begin(), temp_node_properties.end(), + temp_node_property_names.begin(), + [](const TemporaryPropertyGuard& p) { return p.name(); }); + using Impl = LouvainClusteringImplementation; if (auto result = ConstructNodeProperties( pfg, temp_node_property_names); @@ -389,16 +394,6 @@ LouvainClusteringWithWrap( return r.error(); } - // // Remove all the existing node/edge properties - // if (auto r = pfg->RemoveAllNodeProperties(); !r) { - // return r.error(); - // } - for (auto property : temp_node_property_names) { - if (auto r = pfg->RemoveNodeProperty(property); !r) { - return r.error(); - } - } - if (auto r = ConstructNodeProperties>( pfg, {output_property_name}); !r) { diff --git a/lonestar/analytics/cpu/local_clustering_coefficient/local_clustering_coefficient_cli.cpp b/lonestar/analytics/cpu/local_clustering_coefficient/local_clustering_coefficient_cli.cpp index 5a408071c8..5461c434d8 100644 --- a/lonestar/analytics/cpu/local_clustering_coefficient/local_clustering_coefficient_cli.cpp +++ b/lonestar/analytics/cpu/local_clustering_coefficient/local_clustering_coefficient_cli.cpp @@ -74,12 +74,11 @@ main(int argc, char** argv) { switch (algo) { case LocalClusteringCoefficientPlan::kOrderedCountAtomics: - plan = LocalClusteringCoefficientPlan::LocalClusteringCoefficientAtomics( - relabeling_flag); + plan = LocalClusteringCoefficientPlan::OrderedCountAtomics(relabeling_flag); break; case LocalClusteringCoefficientPlan::kOrderedCountPerThread: - plan = LocalClusteringCoefficientPlan::LocalClusteringCoefficientPerThread( - relabeling_flag); + plan = + LocalClusteringCoefficientPlan::OrderedCountPerThread(relabeling_flag); break; default: std::cerr << "Unknown algo: " << algo << "\n"; From 3213071b30ddc91a2781c65595d64c9149122846 Mon Sep 17 00:00:00 2001 From: Arthur Peters Date: Tue, 27 Apr 2021 10:59:10 -0500 Subject: [PATCH 4/6] libgalois: Fix correctness bugs in subgraph extraction. * Was reordering edges passed in. * Wasn't correctly generating CSR. * Was using source graph edge IDs instead of output edge IDs. --- .../subgraph_extraction.cpp | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/libgalois/src/analytics/subgraph_extraction/subgraph_extraction.cpp b/libgalois/src/analytics/subgraph_extraction/subgraph_extraction.cpp index 095c0400f7..7506adb00d 100644 --- a/libgalois/src/analytics/subgraph_extraction/subgraph_extraction.cpp +++ b/libgalois/src/analytics/subgraph_extraction/subgraph_extraction.cpp @@ -52,18 +52,16 @@ SubGraphNodeSet( uint32_t src = node_set[n]; auto last = graph->edges(src).end(); - for (auto dest : node_set) { + for (uint32_t m = 0; m < num_nodes; ++m) { + auto dest = node_set[m]; // Binary search on the edges sorted by destination id - auto lower_bound = katana::FindEdgeSortedByDest(graph, src, dest); - if (lower_bound != *last) { - while (lower_bound != *last && - *graph->GetEdgeDest(lower_bound) == dest) { - subgraph_edges[n].push_back(dest); - lower_bound++; - } + auto edge_id = katana::FindEdgeSortedByDest(graph, src, dest); + while (edge_id != *last && *graph->GetEdgeDest(edge_id) == dest) { + subgraph_edges[n].push_back(m); + edge_id++; } - (*out_indices)[n] = subgraph_edges[n].size(); } + (*out_indices)[n] = subgraph_edges[n].size(); }, katana::steal(), katana::no_stats(), katana::loopname("SubgraphExtraction")); @@ -73,6 +71,7 @@ SubGraphNodeSet( (*out_indices)[i] += (*out_indices)[i - 1]; } uint64_t num_edges = (*out_indices)[num_nodes - 1]; + // Subgraph topology : out dests auto out_dests = std::make_unique>(); out_dests->allocateInterleaved(num_edges); @@ -80,7 +79,7 @@ SubGraphNodeSet( katana::do_all( katana::iterate(uint32_t(0), uint32_t(num_nodes)), [&](const uint32_t& n) { - uint64_t offset = (*out_indices)[n]; + uint64_t offset = n == 0 ? 0 : (*out_indices)[n - 1]; for (uint32_t dest : subgraph_edges[n]) { (*out_dests)[offset] = dest; offset++; @@ -88,6 +87,14 @@ SubGraphNodeSet( }, katana::no_stats(), katana::loopname("ConstructTopology")); + // TODO(amp): The pattern out_indices.release()->data() is leaking both the + // LargeBuffer instance (just a few bytes), AND the buffer itself. The + // instance is leaking because of the call to release without passing + // ownership of the instance to some other object. The buffer is leaking + // because arrow::MutableBuffer does not own it's data, so it will never + // deallocate the buffer passed to arrow::MutableBuffer::Wrap. + // This pattern probably exists elsewhere in the code. + // Set new topology auto numeric_array_out_indices = std::make_shared>( @@ -122,10 +129,11 @@ katana::analytics::SubGraphExtraction( // Remove duplicates from the node vector std::unordered_set set; std::vector dedup_node_vec; - for (auto i : node_vec) { - set.insert(i); + for (auto n : node_vec) { + if (set.insert(n).second) { // If n wasn't already present. + dedup_node_vec.push_back(n); + } } - dedup_node_vec.assign(set.begin(), set.end()); katana::StatTimer execTime("SubGraph-Extraction"); switch (plan.algorithm()) { From 7b9d19f5290b49fc3e04fa244b93923e48f92829 Mon Sep 17 00:00:00 2001 From: Arthur Peters Date: Thu, 22 Apr 2021 18:05:12 -0500 Subject: [PATCH 5/6] Wrap algorithms for Python. local_clustering_coefficient, louvain_clustering, subgraph_extraction The tests are insufficient and need work. --- python/katana/analytics/__init__.py | 14 ++ .../_local_clustering_coefficient.pyx | 119 +++++++++++ .../katana/analytics/_louvain_clustering.pyx | 188 ++++++++++++++++++ .../katana/analytics/_subgraph_extraction.pyx | 73 +++++++ python/katana/property_graph.pxd.jinja | 2 + python/katana/property_graph.pyx.jinja | 6 + python/katana_setup.py | 2 +- python/tests/test_cpp_algos.py | 52 +++++ 8 files changed, 455 insertions(+), 1 deletion(-) create mode 100644 python/katana/analytics/_local_clustering_coefficient.pyx create mode 100644 python/katana/analytics/_louvain_clustering.pyx create mode 100644 python/katana/analytics/_subgraph_extraction.pyx diff --git a/python/katana/analytics/__init__.py b/python/katana/analytics/__init__.py index dab88ff239..1c981a8fb8 100644 --- a/python/katana/analytics/__init__.py +++ b/python/katana/analytics/__init__.py @@ -30,6 +30,12 @@ .. automodule:: katana.analytics._independent_set +.. automodule:: katana.analytics._louvain_clustering + +.. automodule:: katana.analytics._local_clustering_coefficient + +.. automodule:: katana.analytics._subgraph_extraction + .. automodule:: katana.analytics._jaccard .. automodule:: katana.analytics._k_core @@ -67,6 +73,14 @@ IndependentSetStatistics, ) from katana.analytics._jaccard import jaccard, jaccard_assert_valid, JaccardPlan, JaccardStatistics +from katana.analytics._louvain_clustering import ( + louvain_clustering, + louvain_clustering_assert_valid, + LouvainClusteringPlan, + LouvainClusteringStatistics, +) +from katana.analytics._local_clustering_coefficient import local_clustering_coefficient, LocalClusteringCoefficientPlan +from katana.analytics._subgraph_extraction import subgraph_extraction, SubGraphExtractionPlan from katana.analytics._k_core import k_core, k_core_assert_valid, KCorePlan, KCoreStatistics from katana.analytics._k_truss import k_truss, k_truss_assert_valid, KTrussPlan, KTrussStatistics from katana.analytics._pagerank import pagerank, pagerank_assert_valid, PagerankPlan, PagerankStatistics diff --git a/python/katana/analytics/_local_clustering_coefficient.pyx b/python/katana/analytics/_local_clustering_coefficient.pyx new file mode 100644 index 0000000000..bcc496fd45 --- /dev/null +++ b/python/katana/analytics/_local_clustering_coefficient.pyx @@ -0,0 +1,119 @@ +from katana.cpp.libstd.iostream cimport ostream, ostringstream +from katana.cpp.libgalois.graphs.Graph cimport _PropertyGraph +from katana.cpp.libsupport.result cimport handle_result_void, handle_result_assert, raise_error_code, Result +from katana.analytics.plan cimport Plan, _Plan +from katana.property_graph cimport PropertyGraph + +from libcpp.string cimport string +from libcpp cimport bool + +from enum import Enum + +# TODO(amp): Module needs documenting. + + +cdef extern from "katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h" namespace "katana::analytics" nogil: + cppclass _LocalClusteringCoefficientPlan "katana::analytics::LocalClusteringCoefficientPlan" (_Plan): + enum Algorithm: + kOrderedCountAtomics "katana::analytics::LocalClusteringCoefficientPlan::kOrderedCountAtomics" + kOrderedCountPerThread "katana::analytics::LocalClusteringCoefficientPlan::kOrderedCountPerThread" + + enum Relabeling: + kRelabel "katana::analytics::LocalClusteringCoefficientPlan::kRelabel" + kNoRelabel "katana::analytics::LocalClusteringCoefficientPlan::kNoRelabel" + kAutoRelabel "katana::analytics::LocalClusteringCoefficientPlan::kAutoRelabel" + + _LocalClusteringCoefficientPlan.Algorithm algorithm() const + _LocalClusteringCoefficientPlan.Relabeling relabeling() const + bool edges_sorted() const + + # LocalClusteringCoefficientPlan() + + @staticmethod + _LocalClusteringCoefficientPlan OrderedCountAtomics( + bool edges_sorted, + _LocalClusteringCoefficientPlan.Relabeling relabeling + ) + @staticmethod + _LocalClusteringCoefficientPlan OrderedCountPerThread( + bool edges_sorted, + _LocalClusteringCoefficientPlan.Relabeling relabeling + ) + + _LocalClusteringCoefficientPlan.Relabeling kDefaultRelabeling "katana::analytics::LocalClusteringCoefficientPlan::kDefaultRelabeling" + bool kDefaultEdgesSorted "katana::analytics::LocalClusteringCoefficientPlan::kDefaultEdgesSorted" + + Result[void] LocalClusteringCoefficient(_PropertyGraph* pfg, const string& output_property_name, _LocalClusteringCoefficientPlan plan) + + +class _LocalClusteringCoefficientPlanAlgorithm(Enum): + OrderedCountAtomics = _LocalClusteringCoefficientPlan.Algorithm.kOrderedCountAtomics + OrderedCountPerThread = _LocalClusteringCoefficientPlan.Algorithm.kOrderedCountPerThread + + +cdef _relabeling_to_python(v): + if v == _LocalClusteringCoefficientPlan.Relabeling.kRelabel: + return True + elif v == _LocalClusteringCoefficientPlan.Relabeling.kNoRelabel: + return False + else: + return None + + +cdef _relabeling_from_python(v): + if v is None: + return _LocalClusteringCoefficientPlan.Relabeling.kAutoRelabel + elif v: + return _LocalClusteringCoefficientPlan.Relabeling.kRelabel + else: + return _LocalClusteringCoefficientPlan.Relabeling.kNoRelabel + + +cdef class LocalClusteringCoefficientPlan(Plan): + cdef: + _LocalClusteringCoefficientPlan underlying_ + + cdef _Plan* underlying(self) except NULL: + return &self.underlying_ + + Algorithm = _LocalClusteringCoefficientPlanAlgorithm + + @staticmethod + cdef LocalClusteringCoefficientPlan make(_LocalClusteringCoefficientPlan u): + f = LocalClusteringCoefficientPlan.__new__(LocalClusteringCoefficientPlan) + f.underlying_ = u + return f + + @property + def algorithm(self) -> Algorithm: + return _LocalClusteringCoefficientPlanAlgorithm(self.underlying_.algorithm()) + + @property + def relabeling(self): + return self.underlying_.relabeling() + + @property + def edges_sorted(self) -> bool: + return self.underlying_.edges_sorted() + + @staticmethod + def ordered_count_atomics( + relabeling = _relabeling_to_python(kDefaultRelabeling), + bool edges_sorted = kDefaultEdgesSorted + ): + return LocalClusteringCoefficientPlan.make(_LocalClusteringCoefficientPlan.OrderedCountAtomics( + edges_sorted, _relabeling_from_python(relabeling))) + + @staticmethod + def ordered_count_per_thread( + relabeling = _relabeling_to_python(kDefaultRelabeling), + bool edges_sorted = kDefaultEdgesSorted + ): + return LocalClusteringCoefficientPlan.make(_LocalClusteringCoefficientPlan.OrderedCountPerThread( + edges_sorted, _relabeling_from_python(relabeling))) + + +def local_clustering_coefficient(PropertyGraph pg, str output_property_name, LocalClusteringCoefficientPlan plan = LocalClusteringCoefficientPlan()): + cdef string output_property_name_str = bytes(output_property_name, "utf-8") + with nogil: + handle_result_void(LocalClusteringCoefficient(pg.underlying.get(), output_property_name_str, plan.underlying_)) diff --git a/python/katana/analytics/_louvain_clustering.pyx b/python/katana/analytics/_louvain_clustering.pyx new file mode 100644 index 0000000000..c5d1d498f8 --- /dev/null +++ b/python/katana/analytics/_louvain_clustering.pyx @@ -0,0 +1,188 @@ +from katana.cpp.libstd.iostream cimport ostream, ostringstream +from katana.cpp.libgalois.graphs.Graph cimport _PropertyGraph +from katana.cpp.libsupport.result cimport handle_result_void, handle_result_assert, raise_error_code, Result +from katana.analytics.plan cimport Plan, _Plan +from katana.property_graph cimport PropertyGraph + +from libc.stdint cimport uint32_t, uint64_t +from libcpp.string cimport string +from libcpp cimport bool + +from enum import Enum + +# TODO(amp): Module needs documenting. + + +cdef extern from "katana/analytics/louvain_clustering/louvain_clustering.h" namespace "katana::analytics" nogil: + cppclass _LouvainClusteringPlan "katana::analytics::LouvainClusteringPlan" (_Plan): + enum Algorithm: + kDoAll "katana::analytics::LouvainClusteringPlan::kDoAll" + + _LouvainClusteringPlan.Algorithm algorithm() const + bool enable_vf() const + double modularity_threshold_per_round() const + double modularity_threshold_total() const + uint32_t max_iterations() const + uint32_t min_graph_size() const + + # LouvainClusteringPlan() + + @staticmethod + _LouvainClusteringPlan DoAll( + bool enable_vf, + double modularity_threshold_per_round, + double modularity_threshold_total, + uint32_t max_iterations, + uint32_t min_graph_size + ) + + bool kDefaultEnableVF "katana::analytics::LouvainClusteringPlan::kDefaultEnableVF" + double kDefaultModularityThresholdPerRound "katana::analytics::LouvainClusteringPlan::kDefaultModularityThresholdPerRound" + double kDefaultModularityThresholdTotal "katana::analytics::LouvainClusteringPlan::kDefaultModularityThresholdTotal" + uint32_t kDefaultMaxIterations "katana::analytics::LouvainClusteringPlan::kDefaultMaxIterations" + uint32_t kDefaultMinGraphSize "katana::analytics::LouvainClusteringPlan::kDefaultMinGraphSize" + + Result[void] LouvainClustering(_PropertyGraph* pfg, const string& edge_weight_property_name,const string& output_property_name, _LouvainClusteringPlan plan) + + Result[void] LouvainClusteringAssertValid(_PropertyGraph* pfg, + const string& edge_weight_property_name, + const string& output_property_name + ) + + cppclass _LouvainClusteringStatistics "katana::analytics::LouvainClusteringStatistics": + uint64_t n_clusters + uint64_t n_non_trivial_clusters + uint64_t largest_cluster_size + double largest_cluster_proportion + double modularity + + void Print(ostream os) + + @staticmethod + Result[_LouvainClusteringStatistics] Compute(_PropertyGraph* pfg, + const string& edge_weight_property_name, + const string& output_property_name + ) + + +class _LouvainClusteringPlanAlgorithm(Enum): + DoAll = _LouvainClusteringPlan.Algorithm.kDoAll + + +cdef class LouvainClusteringPlan(Plan): + cdef: + _LouvainClusteringPlan underlying_ + + cdef _Plan* underlying(self) except NULL: + return &self.underlying_ + + Algorithm = _LouvainClusteringPlanAlgorithm + + @staticmethod + cdef LouvainClusteringPlan make(_LouvainClusteringPlan u): + f = LouvainClusteringPlan.__new__(LouvainClusteringPlan) + f.underlying_ = u + return f + + @property + def algorithm(self) -> Algorithm: + return _LouvainClusteringPlanAlgorithm(self.underlying_.algorithm()) + + @property + def enable_vf(self) -> bool: + return self.underlying_.enable_vf() + + @property + def modularity_threshold_per_round(self) -> double: + return self.underlying_.modularity_threshold_per_round() + + @property + def modularity_threshold_total(self) -> double: + return self.underlying_.modularity_threshold_total() + + @property + def max_iterations(self) -> uint32_t: + return self.underlying_.max_iterations() + + @property + def min_graph_size(self) -> uint32_t: + return self.underlying_.min_graph_size() + + + @staticmethod + def do_all( + bool enable_vf = kDefaultEnableVF, + double modularity_threshold_per_round = kDefaultModularityThresholdPerRound, + double modularity_threshold_total = kDefaultModularityThresholdTotal, + uint32_t max_iterations = kDefaultMaxIterations, + uint32_t min_graph_size = kDefaultMinGraphSize + ) -> LouvainClusteringPlan: + return LouvainClusteringPlan.make(_LouvainClusteringPlan.DoAll( + enable_vf, modularity_threshold_per_round, modularity_threshold_total, max_iterations, min_graph_size)) + + +def louvain_clustering(PropertyGraph pg, str edge_weight_property_name, str output_property_name, LouvainClusteringPlan plan = LouvainClusteringPlan()): + cdef string edge_weight_property_name_str = bytes(edge_weight_property_name, "utf-8") + cdef string output_property_name_str = bytes(output_property_name, "utf-8") + with nogil: + handle_result_void(LouvainClustering(pg.underlying.get(), edge_weight_property_name_str, output_property_name_str, plan.underlying_)) + + +def louvain_clustering_assert_valid(PropertyGraph pg, str edge_weight_property_name, str output_property_name ): + cdef string edge_weight_property_name_str = bytes(edge_weight_property_name, "utf-8") + cdef string output_property_name_str = bytes(output_property_name, "utf-8") + with nogil: + handle_result_assert(LouvainClusteringAssertValid(pg.underlying.get(), + edge_weight_property_name_str, + output_property_name_str + )) + + +cdef _LouvainClusteringStatistics handle_result_LouvainClusteringStatistics(Result[_LouvainClusteringStatistics] res) nogil except *: + if not res.has_value(): + with gil: + raise_error_code(res.error()) + return res.value() + + +cdef class LouvainClusteringStatistics: + cdef _LouvainClusteringStatistics underlying + + def __init__(self, PropertyGraph pg, + str edge_weight_property_name, + str output_property_name + ): + cdef string edge_weight_property_name_str = bytes(edge_weight_property_name, "utf-8") + cdef string output_property_name_str = bytes(output_property_name, "utf-8") + with nogil: + self.underlying = handle_result_LouvainClusteringStatistics(_LouvainClusteringStatistics.Compute( + pg.underlying.get(), + edge_weight_property_name_str, + output_property_name_str + )) + + @property + def n_clusters(self) -> uint64_t: + return self.underlying.n_clusters + + @property + def n_non_trivial_clusters(self) -> uint64_t: + return self.underlying.n_non_trivial_clusters + + @property + def largest_cluster_size(self) -> uint64_t: + return self.underlying.largest_cluster_size + + @property + def largest_cluster_proportion(self) -> double: + return self.underlying.largest_cluster_proportion + + @property + def modularity(self) -> double: + return self.underlying.modularity + + + def __str__(self) -> str: + cdef ostringstream ss + self.underlying.Print(ss) + return str(ss.str(), "ascii") diff --git a/python/katana/analytics/_subgraph_extraction.pyx b/python/katana/analytics/_subgraph_extraction.pyx new file mode 100644 index 0000000000..3efddfc482 --- /dev/null +++ b/python/katana/analytics/_subgraph_extraction.pyx @@ -0,0 +1,73 @@ +from katana.cpp.libgalois.graphs.Graph cimport _PropertyGraph +from katana.cpp.libstd.iostream cimport ostream, ostringstream +from katana.cpp.libsupport.result cimport handle_result_void, handle_result_assert, raise_error_code, Result +from katana.analytics.plan cimport Plan, _Plan +from katana.property_graph cimport PropertyGraph + +from libcpp.vector cimport vector +from libc.stdint cimport uint32_t +from pyarrow.lib cimport to_shared +from libcpp.memory cimport shared_ptr, unique_ptr +from katana.cpp.libsupport.result cimport Result + +from enum import Enum + +# TODO(amp): Module needs documenting. + + +cdef extern from "katana/analytics/subgraph_extraction/subgraph_extraction.h" namespace "katana::analytics" nogil: + cppclass _SubGraphExtractionPlan "katana::analytics::SubGraphExtractionPlan" (_Plan): + enum Algorithm: + kNodeSet "katana::analytics::SubGraphExtractionPlan::kNodeSet" + + _SubGraphExtractionPlan.Algorithm algorithm() const + + # SubGraphExtractionPlan() + + @staticmethod + _SubGraphExtractionPlan NodeSet( + ) + + Result[unique_ptr[_PropertyGraph]] SubGraphExtraction(_PropertyGraph* pfg, const vector[uint32_t]& node_vec, _SubGraphExtractionPlan plan) + + +class _SubGraphExtractionPlanAlgorithm(Enum): + NodeSet = _SubGraphExtractionPlan.Algorithm.kNodeSet + + +cdef class SubGraphExtractionPlan(Plan): + cdef: + _SubGraphExtractionPlan underlying_ + + cdef _Plan* underlying(self) except NULL: + return &self.underlying_ + + Algorithm = _SubGraphExtractionPlanAlgorithm + + @staticmethod + cdef SubGraphExtractionPlan make(_SubGraphExtractionPlan u): + f = SubGraphExtractionPlan.__new__(SubGraphExtractionPlan) + f.underlying_ = u + return f + + @property + def algorithm(self) -> Algorithm: + return _SubGraphExtractionPlanAlgorithm(self.underlying_.algorithm()) + + @staticmethod + def node_set() -> SubGraphExtractionPlan: + return SubGraphExtractionPlan.make(_SubGraphExtractionPlan.NodeSet()) + + +cdef shared_ptr[_PropertyGraph] handle_result_property_graph(Result[unique_ptr[_PropertyGraph]] res) nogil except *: + if not res.has_value(): + with gil: + raise_error_code(res.error()) + return to_shared(res.value()) + + +def subgraph_extraction(PropertyGraph pg, node_vec, SubGraphExtractionPlan plan = SubGraphExtractionPlan()) -> PropertyGraph: + cdef vector[uint32_t] vec = [n for n in node_vec] + with nogil: + v = handle_result_property_graph(SubGraphExtraction(pg.underlying.get(), vec, plan.underlying_)) + return PropertyGraph.make(v) diff --git a/python/katana/property_graph.pxd.jinja b/python/katana/property_graph.pxd.jinja index e473383709..d525d0adc3 100644 --- a/python/katana/property_graph.pxd.jinja +++ b/python/katana/property_graph.pxd.jinja @@ -14,6 +14,8 @@ cdef class PropertyGraph: @staticmethod cdef uint64_t _property_name_to_id(object prop, Schema schema) except -1 + @staticmethod + cdef PropertyGraph make(shared_ptr[_PropertyGraph] u) cdef GraphTopology topology(PropertyGraph) cpdef uint64_t num_nodes(PropertyGraph) diff --git a/python/katana/property_graph.pyx.jinja b/python/katana/property_graph.pyx.jinja index f360ce7afb..eb481247fb 100644 --- a/python/katana/property_graph.pyx.jinja +++ b/python/katana/property_graph.pyx.jinja @@ -60,6 +60,12 @@ cdef class PropertyGraph: opts.edge_properties = &edge_props self.underlying = handle_result_value(_PropertyGraph.Make(bytes(path, "utf-8"), opts)) + @staticmethod + cdef PropertyGraph make(shared_ptr[_PropertyGraph] u): + f = PropertyGraph.__new__(PropertyGraph) + f.underlying = u + return f + def write(self, path, command_line) : """ Write the property graph out the specified path or URL (or the original path it was loaded from if path is nor provided). Provide lineage information in the form of a command line. diff --git a/python/katana_setup.py b/python/katana_setup.py index 01878e1cd7..ae46dcb7ad 100644 --- a/python/katana_setup.py +++ b/python/katana_setup.py @@ -102,7 +102,7 @@ def _get_build_extension(): def test_cython_module(name, cython_code, python_code="", extension_options=None): extension_options = extension_options or {} - require_python_module("cython") + require_python_module("Cython") import Cython.Build import Cython.Build.Inline diff --git a/python/tests/test_cpp_algos.py b/python/tests/test_cpp_algos.py index 09e39e7bc3..26a7567a2f 100644 --- a/python/tests/test_cpp_algos.py +++ b/python/tests/test_cpp_algos.py @@ -1,6 +1,7 @@ import numpy as np from pyarrow import Schema, table from pytest import approx, raises +import pytest from katana import GaloisError from katana.analytics import * @@ -291,3 +292,54 @@ def test_k_truss_fail(): with raises(GaloisError): k_truss(property_graph, 1, "output2") + + +def test_louvain_clustering(): + property_graph = PropertyGraph(get_input("propertygraphs/rmat10_symmetric")) + + louvain_clustering(property_graph, "value", "output") + + louvain_clustering_assert_valid(property_graph, "value", "output") + + stats = LouvainClusteringStatistics(property_graph, "value", "output") + + # TODO(amp): This is non-deterministic. Are there bounds on the results we could check? + # assert stats.n_clusters == 83 + # assert stats.n_non_trivial_clusters == 13 + # assert stats.largest_cluster_size == 297 + + +def test_local_clustering_coefficient(): + property_graph = PropertyGraph(get_input("propertygraphs/rmat15_cleaned_symmetric")) + + local_clustering_coefficient(property_graph, "output") + property_graph: PropertyGraph + out = property_graph.get_node_property("output") + + assert out[-1].as_py() == 0 + assert not np.any(np.isnan(out)) + + +def test_subgraph_extraction(): + property_graph = PropertyGraph(get_input("propertygraphs/rmat15_cleaned_symmetric")) + sort_all_edges_by_dest(property_graph) + nodes = [1, 3, 11, 120] + + expected_edges = [ + [ + nodes.index(property_graph.get_edge_dst(e)) + for e in property_graph.edges(i) + if property_graph.get_edge_dst(e) in nodes + ] + for i in nodes + ] + + pg = subgraph_extraction(property_graph, nodes) + + assert isinstance(pg, PropertyGraph) + assert len(pg) == len(nodes) + assert pg.num_edges() == 6 + + for i in range(len(expected_edges)): + assert len(pg.edges(i)) == len(expected_edges[i]) + assert [pg.get_edge_dst(e) for e in pg.edges(i)] == expected_edges[i] From 98964848f1fead58f223aef744c3bf72728f5769 Mon Sep 17 00:00:00 2001 From: Arthur Peters Date: Sun, 25 Apr 2021 17:41:14 -0500 Subject: [PATCH 6/6] tsuba: Include the property names in the error if not unique. --- libtsuba/src/RDGCore.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libtsuba/src/RDGCore.cpp b/libtsuba/src/RDGCore.cpp index 5562b30c2e..8d329207f0 100644 --- a/libtsuba/src/RDGCore.cpp +++ b/libtsuba/src/RDGCore.cpp @@ -57,7 +57,8 @@ UpsertProperties( if (!next->schema()->HasDistinctFieldNames()) { return KATANA_ERROR( - tsuba::ErrorCode::Exists, "column names are not distinct"); + tsuba::ErrorCode::Exists, "column names are not distinct: {}", + fmt::join(next->schema()->field_names(), ", ")); } *to_update = next;