KatanaGraph · arthurp · Apr 21, 2021 · arthurp · Apr 21, 2021 · gurbinder533
diff --git a/...lois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h b/...lois/include/katana/analytics/local_clustering_coefficient/local_clustering_coefficient.h
@@ -45,6 +45,7 @@ class LocalClusteringCoefficientPlan : public Plan {
             kDefaultRelabeling} {}
 
   Algorithm algorithm() const { return algorithm_; }
+  // TODO(amp): These parameters should be documented.
   Relabeling relabeling() const { return relabeling_; }
   bool edges_sorted() const { return edges_sorted_; }
 
@@ -70,6 +71,7 @@ class LocalClusteringCoefficientPlan : public Plan {
   }
 };
 
+// TODO(amp): The doc string was not updated.
 /**
  * Count the total number of triangles in the graph. The graph must be
  * symmetric!

diff --git a/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h b/libgalois/include/katana/analytics/louvain_clustering/louvain_clustering.h
@@ -13,7 +13,6 @@ namespace katana::analytics {
 /// parameters associated with it.
 class LouvainClusteringPlan : public Plan {
 public:
-  /// Algorithm selectors for Single-Source Shortest Path
   enum Algorithm {
     kDoAll,
   };
@@ -53,9 +52,17 @@ class LouvainClusteringPlan : public Plan {
 
 public:
   LouvainClusteringPlan()
-      : LouvainClusteringPlan{kCPU, kDoAll, false, 0.01, 0.01, 10, 100} {}
+      : LouvainClusteringPlan{
+            kCPU,
+            kDoAll,
+            kEnableVF,
+            kModularityThresholdPerRound,
+            kModularityThresholdTotal,
+            kMaxIterations,
+            kMinGraphSize} {}
 
   Algorithm algorithm() const { return algorithm_; }
+  // TODO(amp): These parameters should be documented.
   bool is_enable_vf() const { return enable_vf_; }
   double modularity_threshold_per_round() const {
     return modularity_threshold_per_round_;
@@ -86,7 +93,7 @@ class LouvainClusteringPlan : public Plan {
 /// Compute the Louvain Clustering for pg.
 /// The edge weights are taken from the property named
 /// edge_weight_property_name (which may be a 32- or 64-bit sign or unsigned
-/// int), and the computed cluster ids are stored in the property named
+/// int), and the computed cluster IDs are stored in the property named
 /// output_property_name (as uint32_t).
 /// The property named output_property_name is created by this function and may
 /// not exist before the call.

diff --git a/libgalois/include/katana/analytics/random_walks/random_walks.h b/libgalois/include/katana/analytics/random_walks/random_walks.h
@@ -14,13 +14,19 @@ namespace katana::analytics {
 
 /// A computational plan to for random walks, specifying the algorithm and any
 /// parameters associated with it.
-class RandomWalksPlan : Plan {
-  enum Algo { node2vec, edge2vec };
-
+class RandomWalksPlan : public Plan {
 public:
   /// Algorithm selectors for Connected-components
   enum Algorithm { kNode2Vec, kEdge2Vec };
 
+  static const Algorithm kDefaultAlgorithm = kNode2Vec;
+  static const uint32_t kDefaultWalkLength = 1;
+  static const uint32_t kDefaultNumberOfWalks = 1;
+  constexpr static const double kDefaultBackwardProbability = 1.0;
+  constexpr static const double kDefaultForwardProbability = 1.0;
+  static const uint32_t kDefaultMaxIterations = 10;
+  static const uint32_t kDefaultNumberOfEdgeTypes = 1;
+
   // Don't allow people to directly construct these, so as to have only one
   // consistent way to configure.
 private:
@@ -50,23 +56,51 @@ class RandomWalksPlan : Plan {
         number_of_edge_types_(number_of_edge_types) {}
 
 public:
-  // kChunkSize is a fixed const int (default value: 1)
+  // kChunkSize is fixed at 1
   static const int kChunkSize;
 
-  RandomWalksPlan() : RandomWalksPlan{kCPU, kNode2Vec, 1, 1, 1.0, 1.0, 10, 1} {}
+  RandomWalksPlan()
+      : RandomWalksPlan{
+            kCPU,
+            kDefaultAlgorithm,
+            kDefaultWalkLength,
+            kDefaultNumberOfWalks,
+            kDefaultBackwardProbability,
+            kDefaultForwardProbability,
+            kDefaultMaxIterations,
+            kDefaultNumberOfEdgeTypes} {}
 
   Algorithm algorithm() const { return algorithm_; }
+
+  // TODO(amp): The parameters walk_length, number_of_walks,
+  //  backward_probability, and forward_probability control the expected output,
+  //  not the algorithm used to compute the output. So they need to be parameters
+  //  on the algorithm function, not in the plan. The plan should be parameters
+  //  which do not change the expected output (though they may cause selecting a
+  //  different correct output).
+
+  /// Length of random walks.
   uint32_t walk_length() const { return walk_length_; }
+
+  /// Number of walks per node.
   uint32_t number_of_walks() const { return number_of_walks_; }
+
+  /// Probability of moving back to parent.
   double backward_probability() const { return backward_probability_; }
+
+  /// Probability of moving forward (2-hops).
   double forward_probability() const { return forward_probability_; }
+
   uint32_t max_iterations() const { return max_iterations_; }
+
   uint32_t number_of_edge_types() const { return number_of_edge_types_; }
 
   /// Node2Vec algorithm to generate random walks on the graph
   static RandomWalksPlan Node2Vec(
-      uint32_t walk_length, uint32_t number_of_walks,
-      double backward_probability, double forward_probability) {
+      uint32_t walk_length = kDefaultWalkLength,
+      uint32_t number_of_walks = kDefaultNumberOfWalks,
+      double backward_probability = kDefaultBackwardProbability,
+      double forward_probability = kDefaultBackwardProbability) {
     return {
         kCPU,
         kNode2Vec,
@@ -81,9 +115,12 @@ class RandomWalksPlan : Plan {
   /// Edge2Vec algorithm to generate random walks on the graph.
   /// Takes the heterogeneity of the edges into account
   static RandomWalksPlan Edge2Vec(
-      uint32_t walk_length, uint32_t number_of_walks,
-      double backward_probability, double forward_probability,
-      uint32_t max_iterations, uint32_t number_of_edge_types) {
+      uint32_t walk_length = kDefaultWalkLength,
+      uint32_t number_of_walks = kDefaultNumberOfWalks,
+      double backward_probability = kDefaultBackwardProbability,
+      double forward_probability = kDefaultBackwardProbability,
+      uint32_t max_iterations = kDefaultMaxIterations,
+      uint32_t number_of_edge_types = kDefaultNumberOfEdgeTypes) {
     return {
         kCPU,
         kNode2Vec,
@@ -96,11 +133,10 @@ class RandomWalksPlan : Plan {
   }
 };
 
-/// Compute the random-walks for pg. The pg is expected to be
-/// symmetric.
-/// The parameters can be specified, but have reasonable defaults. Not all parameters
-/// are used by the algorithms.
-/// The generated random-walks generated are return in Katana::InsertBag.
+/// Compute the random-walks for pg. The pg is expected to be symmetric. The
+/// parameters can be specified, but have reasonable defaults. Not all
+/// parameters are used by the algorithms. The generated random-walks generated
+/// are returned as a vector of vectors.
 KATANA_EXPORT Result<std::vector<std::vector<uint32_t>>> RandomWalks(
     PropertyGraph* pg, RandomWalksPlan plan = RandomWalksPlan());
 

diff --git a/libgalois/include/katana/analytics/subgraph_extraction/subgraph_extraction.h b/libgalois/include/katana/analytics/subgraph_extraction/subgraph_extraction.h
@@ -26,6 +26,14 @@ class SubGraphExtractionPlan : public Plan {
 
   Algorithm algorithm() const { return algorithm_; }
 
+  // TODO(amp): This algorithm defines the semantics of the call. If there were
+  //  an algorithm that, for instance, took a list of edges, that would need to
+  //  be a different function, not just a different plan, since it takes
+  //  semantically different arguments. I do think this should have a plan, even
+  //  if there is only one concrete algorithm, but it should be defined and
+  //  documented in terms of the concrete algorithm, not the semantics of the
+  //  function (which is described well below).
+
   /**
    * The node-set algorithm:
    *    Given a set of node ids, this algorithm constructs a new sub-graph
@@ -41,12 +49,13 @@ class SubGraphExtractionPlan : public Plan {
  * The new sub-graph is independent of the original graph.
  *
  * @param pg The graph to process.
- * @param node_vec Set of node ids
+ * @param node_vec Set of node IDs
  * @param plan
  */
 KATANA_EXPORT katana::Result<std::unique_ptr<katana::PropertyGraph>>
 SubGraphExtraction(
-    katana::PropertyGraph* pg, const std::vector<uint32_t>& node_vec,
+    katana::PropertyGraph* pg,
+    const std::vector<katana::PropertyGraph::Node>& node_vec,
     SubGraphExtractionPlan plan = {});
 // const std::vector<std::string>& node_properties_to_copy, const std::vector<std::string>& edge_properties_to_copy);
 

diff --git a/libgalois/src/analytics/random_walks/random_walks.cpp b/libgalois/src/analytics/random_walks/random_walks.cpp
@@ -44,9 +44,8 @@ struct Node2VecAlgo {
       return graph.num_nodes();
     }
     double total_wt = degree[n];
-    prob = prob * total_wt;
 
-    uint32_t edge_index = std::floor(prob);
+    uint32_t edge_index = std::floor(prob * total_wt);
     auto edge = graph.edge_begin(n) + edge_index;
     return *graph.GetEdgeDest(edge);
   }
@@ -79,7 +78,7 @@ struct Node2VecAlgo {
     uint64_t total_walks = graph.size() * plan_.number_of_walks();
 
     katana::do_all(
-        katana::iterate((uint64_t)0, total_walks),
+        katana::iterate(uint64_t(0), total_walks),
         [&](uint64_t idx) {
           GNode n = idx % graph.size();
 
@@ -144,7 +143,7 @@ struct Node2VecAlgo {
                   alpha = prob_forward;
                 }
 
-                if (alpha >= y) {
+                if (y <= alpha) {
                   //accept y
                   walk.push_back(std::move(nbr));
                   break;
@@ -153,7 +152,7 @@ struct Node2VecAlgo {
             }
           }
 
-          (*walks).push(std::move(walk));
+          walks->push(std::move(walk));
         },
         katana::steal(), katana::chunk_size<RandomWalksPlan::kChunkSize>(),
         katana::loopname("Node2vec walks"), katana::no_stats());
@@ -234,7 +233,7 @@ struct Edge2VecAlgo {
     uint64_t total_walks = graph.size() * plan_.number_of_walks();
 
     katana::do_all(
-        katana::iterate((uint64_t)0, total_walks),
+        katana::iterate(uint64_t(0), total_walks),
         [&](uint64_t idx) {
           GNode n = idx % graph.size();
 
@@ -338,8 +337,8 @@ struct Edge2VecAlgo {
             num_edge_types[type]++;
           }
 
-          (*per_thread_num_edge_types_walks.getLocal())
-              .emplace_back(std::move(num_edge_types));
+          per_thread_num_edge_types_walks.getLocal()->emplace_back(
+              std::move(num_edge_types));
         });
 
     for (unsigned j = 0; j < katana::getActiveThreads(); ++j) {
@@ -359,7 +358,7 @@ struct Edge2VecAlgo {
     transformed_num_edge_types_walks.resize(plan_.number_of_edge_types() + 1);
 
     katana::do_all(
-        katana::iterate((uint32_t)0, plan_.number_of_edge_types() + 1),
+        katana::iterate(uint32_t(0), plan_.number_of_edge_types() + 1),
         [&](uint32_t j) {
           for (uint32_t i = 0; i < rows; i++) {
             transformed_num_edge_types_walks[j].push_back(
@@ -395,8 +394,8 @@ struct Edge2VecAlgo {
       const std::vector<std::vector<uint32_t>>&
           transformed_num_edge_types_walks,
       const std::vector<double>& means) {
-    std::vector<uint32_t> x = transformed_num_edge_types_walks[i];
-    std::vector<uint32_t> y = transformed_num_edge_types_walks[j];
+    const std::vector<uint32_t>& x = transformed_num_edge_types_walks[i];
+    const std::vector<uint32_t>& y = transformed_num_edge_types_walks[j];
 
     double sum = 0.0;
     double sig1 = 0.0;
@@ -425,7 +424,7 @@ struct Edge2VecAlgo {
           transformed_num_edge_types_walks,
       const std::vector<double>& means) {
     katana::do_all(
-        katana::iterate((uint32_t)1, plan_.number_of_edge_types() + 1),
+        katana::iterate(uint32_t(1), plan_.number_of_edge_types() + 1),
         [&](uint32_t i) {
           for (uint32_t j = 1; j <= plan_.number_of_edge_types(); j++) {
             double pearson_corr =
@@ -468,12 +467,11 @@ struct Edge2VecAlgo {
 template <typename Graph>
 void
 InitializeDegrees(const Graph& graph, katana::LargeArray<uint64_t>* degree) {
-  katana::do_all(
-      katana::iterate(graph),
-      [&](typename Graph::Node n) {
-        (*degree)[n] = std::distance(graph.edge_begin(n), graph.edge_end(n));
-      },
-      katana::steal());
+  katana::do_all(katana::iterate(graph), [&](typename Graph::Node n) {
+    // Treat this as O(1) time because subtracting iterators is just pointer
+    // or number subtraction. So don't use steal().
+    (*degree)[n] = graph.edges(n).size();
+  });
 }
 
 }  //namespace
@@ -485,6 +483,12 @@ RandomWalksWithWrap(katana::PropertyGraph* pg, RandomWalksPlan plan) {
     return res.error();
   }
 
+  // TODO(amp): This is incorrect. For Node2vec this needs to be:
+  //    Algorithm::Graph::Make(pg, {}, {}) // Ignoring all properties.
+  //  For Edge2vec this needs to be:
+  //    Algorithm::Graph::Make(pg, {}, {edge_type_property_name})
+  //  The current version requires the input to have exactly the properties
+  //  expected by the algorithm implementation.
   auto pg_result = Algorithm::Graph::Make(pg);
   if (!pg_result) {
     return pg_result.error();
@@ -510,11 +514,8 @@ RandomWalksWithWrap(katana::PropertyGraph* pg, RandomWalksPlan plan) {
   degree.deallocate();
 
   std::vector<std::vector<uint32_t>> walks_in_vector;
-  walks_in_vector.reserve(pg->num_nodes() * plan.number_of_walks());
-  // Copy to vector of vectors.
-  for (auto walk : walks) {
-    walks_in_vector.push_back(walk);
-  }
+  walks_in_vector.reserve(plan.number_of_walks());
+  std::move(walks.begin(), walks.end(), std::back_inserter(walks_in_vector));
   return walks_in_vector;
 }