From 97e7d837b7379f5f185ced360aa10144b1067b24 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Mon, 22 Feb 2021 19:19:02 -0500 Subject: [PATCH 01/18] Refactor Louvain with new graph primitives 1) Fix Dendrogram methods to be const consistent 2) Update experimental update_by_delta_modularity to use new graph primitives. Clean up unused local code --- cpp/src/community/louvain.cuh | 4 +- cpp/src/experimental/louvain.cuh | 1171 +++++------------------------- 2 files changed, 178 insertions(+), 997 deletions(-) diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index e28f0f1746d..a0f7f1c2421 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -138,7 +138,9 @@ class Louvain { return Q; } - Dendrogram &get_dendrogram() const { return *dendrogram_; } + Dendrogram const &get_dendrogram() const { return *dendrogram_; } + + Dendrogram &get_dendrogram() { return *dendrogram_; } std::unique_ptr> move_dendrogram() { return dendrogram_; } diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index f162cd17a61..3597923d6b9 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -29,11 +29,11 @@ #include #include #include +#include +#include #include #include -#include - #include #include @@ -47,343 +47,6 @@ namespace cugraph { namespace experimental { -namespace detail { - -#ifdef CUCO_STATIC_MAP_DEFINED -template -struct create_cuco_pair_t { - cuco::pair_type __device__ operator()(data_t data) - { - cuco::pair_type tmp; - tmp.first = data; - tmp.second = data_t{0}; - return tmp; - } -}; -#endif - -// -// These classes should allow cuco::static_map to generate hash tables of -// different configurations. -// - -// -// Compare edges based on src[e] and dst[e] matching -// -template -class src_dst_equality_comparator_t { - public: - src_dst_equality_comparator_t(rmm::device_vector const &src, - rmm::device_vector const &dst, - sentinel_t sentinel_value) - : d_src_{src.data().get()}, d_dst_{dst.data().get()}, sentinel_value_(sentinel_value) - { - } - - src_dst_equality_comparator_t(data_t const *d_src, data_t const *d_dst, sentinel_t sentinel_value) - : d_src_{d_src}, d_dst_{d_dst}, sentinel_value_(sentinel_value) - { - } - - template - __device__ bool operator()(idx_type lhs_index, idx_type rhs_index) const noexcept - { - return (lhs_index != sentinel_value_) && (rhs_index != sentinel_value_) && - (d_src_[lhs_index] == d_src_[rhs_index]) && (d_dst_[lhs_index] == d_dst_[rhs_index]); - } - - private: - data_t const *d_src_; - data_t const *d_dst_; - sentinel_t sentinel_value_; -}; - -// -// Hash edges based src[e] and dst[e] -// -template -class src_dst_hasher_t { - public: - src_dst_hasher_t(rmm::device_vector const &src, rmm::device_vector const &dst) - : d_src_{src.data().get()}, d_dst_{dst.data().get()} - { - } - - src_dst_hasher_t(data_t const *d_src, data_t const *d_dst) : d_src_{d_src}, d_dst_{d_dst} {} - - template - __device__ auto operator()(idx_type index) const - { - cuco::detail::MurmurHash3_32 hasher; - - auto h_src = hasher(d_src_[index]); - auto h_dst = hasher(d_dst_[index]); - - /* - * Combine the source hash and the dest hash into a single hash value - * - * Taken from the Boost hash_combine function - * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html - */ - h_src ^= h_dst + 0x9e3779b9 + (h_src << 6) + (h_src >> 2); - - return h_src; - } - - private: - data_t const *d_src_; - data_t const *d_dst_; -}; - -// -// Compare edges based on src[e] and 
cluster[dst[e]] matching -// -template -class src_cluster_equality_comparator_t { - public: - src_cluster_equality_comparator_t(rmm::device_vector const &src, - rmm::device_vector const &dst, - rmm::device_vector const &dst_cluster_cache, - data_t base_dst_id, - sentinel_t sentinel_value) - : d_src_{src.data().get()}, - d_dst_{dst.data().get()}, - d_dst_cluster_{dst_cluster_cache.data().get()}, - base_dst_id_(base_dst_id), - sentinel_value_(sentinel_value) - { - } - - src_cluster_equality_comparator_t(data_t const *d_src, - data_t const *d_dst, - data_t const *d_dst_cluster_cache, - data_t base_dst_id, - sentinel_t sentinel_value) - : d_src_{d_src}, - d_dst_{d_dst}, - d_dst_cluster_{d_dst_cluster_cache}, - base_dst_id_(base_dst_id), - sentinel_value_(sentinel_value) - { - } - - __device__ bool operator()(sentinel_t lhs_index, sentinel_t rhs_index) const noexcept - { - return (lhs_index != sentinel_value_) && (rhs_index != sentinel_value_) && - (d_src_[lhs_index] == d_src_[rhs_index]) && - (d_dst_cluster_[d_dst_[lhs_index] - base_dst_id_] == - d_dst_cluster_[d_dst_[rhs_index] - base_dst_id_]); - } - - private: - data_t const *d_src_; - data_t const *d_dst_; - data_t const *d_dst_cluster_; - data_t base_dst_id_; - sentinel_t sentinel_value_; -}; - -// -// Hash edges based src[e] and cluster[dst[e]] -// -template -class src_cluster_hasher_t { - public: - src_cluster_hasher_t(rmm::device_vector const &src, - rmm::device_vector const &dst, - rmm::device_vector const &dst_cluster_cache, - data_t base_dst_id) - : d_src_{src.data().get()}, - d_dst_{dst.data().get()}, - d_dst_cluster_{dst_cluster_cache.data().get()}, - base_dst_id_(base_dst_id) - { - } - - src_cluster_hasher_t(data_t const *d_src, - data_t const *d_dst, - data_t const *d_dst_cluster_cache, - data_t base_dst_id) - : d_src_{d_src}, d_dst_{d_dst}, d_dst_cluster_{d_dst_cluster_cache}, base_dst_id_(base_dst_id) - { - } - - template - __device__ auto operator()(idx_type index) const - { - cuco::detail::MurmurHash3_32 hasher; - - auto h_src = hasher(d_src_[index]); - auto h_cluster = hasher(d_dst_cluster_[d_dst_[index] - base_dst_id_]); - - /* - * Combine the source hash and the cluster hash into a single hash value - * - * Taken from the Boost hash_combine function - * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html - */ - h_src ^= h_cluster + 0x9e3779b9 + (h_src << 6) + (h_src >> 2); - - return h_src; - } - - private: - data_t const *d_src_; - data_t const *d_dst_; - data_t const *d_dst_cluster_; - data_t base_dst_id_; -}; - -// -// Skip edges where src[e] == dst[e] -// -template -class skip_edge_t { - public: - skip_edge_t(rmm::device_vector const &src, rmm::device_vector const &dst) - : d_src_{src.data().get()}, d_dst_{dst.data().get()} - { - } - - skip_edge_t(data_t const *src, data_t const *dst) : d_src_{src}, d_dst_{dst} {} - - template - __device__ auto operator()(idx_type index) const - { - return d_src_[index] == d_dst_[index]; - } - - private: - data_t const *d_src_; - data_t const *d_dst_; -}; - -template -struct lookup_by_vertex_id { - public: - lookup_by_vertex_id(data_t const *d_array, vertex_t const *d_vertices, vertex_t base_vertex_id) - : d_array_(d_array), d_vertices_(d_vertices), base_vertex_id_(base_vertex_id) - { - } - - template - data_t operator() __device__(edge_t edge_id) const - { - return d_array_[d_vertices_[edge_id] - base_vertex_id_]; - } - - private: - data_t const *d_array_; - vertex_t const *d_vertices_; - vertex_t base_vertex_id_; -}; - -template -vector_t 
remove_elements_from_vector(vector_t const &input_v, - iterator_t iterator_begin, - iterator_t iterator_end, - function_t function, - cudaStream_t stream) -{ - vector_t temp_v(input_v.size()); - - auto last = thrust::copy_if( - rmm::exec_policy(stream)->on(stream), iterator_begin, iterator_end, temp_v.begin(), function); - - temp_v.resize(thrust::distance(temp_v.begin(), last)); - - return temp_v; -} - -template -vector_t remove_elements_from_vector(vector_t const &input_v, - function_t function, - cudaStream_t stream) -{ - return remove_elements_from_vector(input_v, input_v.begin(), input_v.end(), function, stream); -} - -// FIXME: This should be a generic utility. The one in cython.cu -// is very close to this -template * = nullptr> -std::unique_ptr> -create_graph(raft::handle_t const &handle, - rmm::device_vector const &src_v, - rmm::device_vector const &dst_v, - rmm::device_vector const &weight_v, - std::size_t num_local_verts, - experimental::graph_properties_t graph_props, - view_t const &view) -{ - std::vector> edgelist( - {{src_v.data().get(), - dst_v.data().get(), - weight_v.data().get(), - static_cast(src_v.size())}}); - - return std::make_unique>( - handle, - edgelist, - view.get_partition(), - num_local_verts, - src_v.size(), - graph_props, - false, - false); -} - -template * = nullptr> -std::unique_ptr> -create_graph(raft::handle_t const &handle, - rmm::device_vector const &src_v, - rmm::device_vector const &dst_v, - rmm::device_vector const &weight_v, - std::size_t num_local_verts, - experimental::graph_properties_t graph_props, - view_t const &view) -{ - experimental::edgelist_t edgelist{ - src_v.data().get(), - dst_v.data().get(), - weight_v.data().get(), - static_cast(src_v.size())}; - - return std::make_unique>( - handle, edgelist, num_local_verts, graph_props, false, false); -} - -} // namespace detail - -// -// FIXME: Ultimately, this would be cleaner and more efficient if we did the following: -// -// 1) Create an object that does a single level Louvain computation on an input graph -// (no graph contraction) -// 2) Create an object that does graph contraction -// 3) Create Louvain to use these objects in sequence to compute the aggregate result. -// -// In MNMG-world, the graph contraction step is going to create another graph that likely -// fits efficiently in a smaller number of GPUs (eventually one). Decomposing the algorithm -// as above would allow us to eventually run the single GPU version of single level Louvain -// on the contracted graphs - which should be more efficient. -// -// FIXME: We should return the dendrogram and let the python layer clean it up (or have a -// separate C++ function to flatten the dendrogram). 
There are customers that might -// like the dendrogram and the implementation would be a bit cleaner if we did the -// collapsing as a separate step -// template class Louvain { public: @@ -405,52 +68,25 @@ class Louvain { handle_(handle), dendrogram_(std::make_unique>()), current_graph_view_(graph_view), - compute_partition_(graph_view), local_num_vertices_(graph_view.get_number_of_local_vertices()), - local_num_rows_(graph_view.get_number_of_local_adj_matrix_partition_rows()), - local_num_cols_(graph_view.get_number_of_local_adj_matrix_partition_cols()), - local_num_edges_(graph_view.get_number_of_edges()), - vertex_weights_v_(graph_view.get_number_of_local_vertices()), - cluster_weights_v_(graph_view.get_number_of_local_vertices()), - number_of_vertices_(graph_view.get_number_of_local_vertices()), + vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), + cluster_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), + src_vertex_weights_cache_v_(0, handle.get_stream()), + src_cluster_weights_cache_v_(0, handle.get_stream()), + dst_cluster_weights_cache_v_(0, handle.get_stream()), + src_cluster_cache_v_(0, handle.get_stream()), + dst_cluster_cache_v_(0, handle.get_stream()), stream_(handle.get_stream()) { if (graph_view_t::is_multi_gpu) { - rank_ = handle.get_comms().get_rank(); - base_vertex_id_ = graph_view.get_local_vertex_first(); - base_src_vertex_id_ = graph_view.get_local_adj_matrix_partition_row_first(0); - base_dst_vertex_id_ = graph_view.get_local_adj_matrix_partition_col_first(0); - - local_num_edges_ = thrust::transform_reduce( - thrust::host, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator( - graph_view.get_number_of_local_adj_matrix_partitions()), - [&graph_view](auto indx) { - return graph_view.get_number_of_local_adj_matrix_partition_edges(indx); - }, - size_t{0}, - thrust::plus()); - - CUDA_TRY(cudaStreamSynchronize(stream_)); - } - - src_indices_v_.resize(local_num_edges_); - - cugraph::detail::offsets_to_indices( - current_graph_view_.offsets(), local_num_rows_, src_indices_v_.data().get()); - - if (base_src_vertex_id_ > 0) { - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - src_indices_v_.begin(), - src_indices_v_.end(), - thrust::make_constant_iterator(base_src_vertex_id_), - src_indices_v_.begin(), - thrust::plus()); + rank_ = handle.get_comms().get_rank(); + base_vertex_id_ = graph_view.get_local_vertex_first(); } } - Dendrogram &get_dendrogram() const { return *dendrogram_; } + Dendrogram const &get_dendrogram() const { return *dendrogram_; } + + Dendrogram &get_dendrogram() { return *dendrogram_; } std::unique_ptr> move_dendrogram() { return dendrogram_; } @@ -458,9 +94,7 @@ class Louvain { { weight_t best_modularity = weight_t{-1}; -#ifdef CUCO_STATIC_MAP_DEFINED - weight_t total_edge_weight; - total_edge_weight = experimental::transform_reduce_e( + weight_t total_edge_weight = experimental::transform_reduce_e( handle_, current_graph_view_, thrust::make_constant_iterator(0), @@ -486,7 +120,6 @@ class Louvain { } timer_display(std::cout); -#endif return best_modularity; } @@ -540,8 +173,8 @@ class Louvain { weight_t sum_internal = experimental::transform_reduce_e( handle_, current_graph_view_, - src_cluster_cache_v_.begin(), - dst_cluster_cache_v_.begin(), + d_src_cluster_cache_, + d_dst_cluster_cache_, [] __device__(auto src, auto dst, weight_t wt, auto src_cluster, auto nbr_cluster) { if (src_cluster == nbr_cluster) { return wt; @@ -561,6 +194,10 @@ class Louvain { { 
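    // Each vertex starts in its own cluster, so the weighted out-degree k_v
    // computed here also serves as the initial cluster weight a_c:
    //   k_v = sum of w(v, nbr) over v's outgoing edges   -> vertex_weights_v_
    //   a_c = sum of k_v over the vertices currently in c -> cluster_weights_v_
    // Both arrays are then cached on the adjacency-matrix rows/columns (in the
    // multi-GPU case) so the edge-centric primitives can read them per edge.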
timer_start("compute_vertex_and_cluster_weights"); + // + // TODO: Once PR 1394 is merged, this can be replaced by: + // vertex_weights_v_ = current_graph_view_.compute_out_weight_sums(handle_); + // experimental::copy_v_transform_reduce_out_nbr( handle_, current_graph_view_, @@ -575,44 +212,67 @@ class Louvain { vertex_weights_v_.end(), cluster_weights_v_.begin()); - cache_vertex_properties( - vertex_weights_v_.begin(), src_vertex_weights_cache_v_, dst_vertex_weights_cache_v_); + d_src_vertex_weights_cache_ = + cache_src_vertex_properties(vertex_weights_v_, src_vertex_weights_cache_v_); - cache_vertex_properties( - cluster_weights_v_.begin(), src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); + std::tie(d_src_cluster_weights_cache_, d_dst_cluster_weights_cache_) = cache_vertex_properties( + cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); timer_stop(stream_); } - template - void cache_vertex_properties(iterator_t const &local_input_iterator, - rmm::device_vector &src_cache_v, - rmm::device_vector &dst_cache_v, - bool src = true, - bool dst = true) + template + T *cache_src_vertex_properties(rmm::device_uvector &input, + rmm::device_uvector &src_cache_v) { - if (src) { - src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows()); - copy_to_adj_matrix_row( - handle_, current_graph_view_, local_input_iterator, src_cache_v.begin()); + if (graph_view_t::is_multi_gpu) { + src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), + stream_); + copy_to_adj_matrix_row(handle_, current_graph_view_, input.begin(), src_cache_v.begin()); + return src_cache_v.begin(); + } else { + return input.begin(); } + } - if (dst) { - dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols()); - copy_to_adj_matrix_col( - handle_, current_graph_view_, local_input_iterator, dst_cache_v.begin()); + template + T *cache_dst_vertex_properties(rmm::device_uvector &input, + rmm::device_uvector &dst_cache_v) + { + if (graph_view_t::is_multi_gpu) { + dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols(), + stream_); + copy_to_adj_matrix_col(handle_, current_graph_view_, input.begin(), dst_cache_v.begin()); + return dst_cache_v.begin(); + } else { + return input.begin(); } } -#ifdef CUCO_STATIC_MAP_DEFINED + template + std::tuple cache_vertex_properties(rmm::device_uvector &input, + rmm::device_uvector &src_cache_v, + rmm::device_uvector &dst_cache_v) + { + auto src = cache_src_vertex_properties(input, src_cache_v); + auto dst = cache_dst_vertex_properties(input, dst_cache_v); + + return std::make_tuple(src, dst); + } + virtual weight_t update_clustering(weight_t total_edge_weight, weight_t resolution) { timer_start("update_clustering"); - rmm::device_vector next_cluster_v(dendrogram_->current_level_begin(), - dendrogram_->current_level_end()); + rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), stream_); - cache_vertex_properties(next_cluster_v.begin(), src_cluster_cache_v_, dst_cluster_cache_v_); + raft::copy(next_cluster_v.begin(), + dendrogram_->current_level_begin(), + dendrogram_->current_level_size(), + stream_); + + std::tie(d_src_cluster_cache_, d_dst_cluster_cache_) = + cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); weight_t new_Q = modularity(total_edge_weight, resolution); weight_t cur_Q = new_Q - 1; @@ -629,8 +289,6 @@ class Louvain { up_down = !up_down; - 
cache_vertex_properties(next_cluster_v.begin(), src_cluster_cache_v_, dst_cluster_cache_v_); - new_Q = modularity(total_edge_weight, resolution); if (new_Q > cur_Q) { @@ -641,27 +299,26 @@ class Louvain { } } - // cache the final clustering locally on each cpu - cache_vertex_properties( - dendrogram_->current_level_begin(), src_cluster_cache_v_, dst_cluster_cache_v_); - timer_stop(stream_); return cur_Q; } void update_by_delta_modularity(weight_t total_edge_weight, weight_t resolution, - rmm::device_vector &next_cluster_v, + rmm::device_uvector &next_cluster_v, bool up_down) { - rmm::device_vector old_cluster_sum_v(local_num_vertices_); - rmm::device_vector src_old_cluster_sum_cache_v; + rmm::device_uvector old_cluster_sum_v(local_num_vertices_, stream_); + rmm::device_uvector cluster_subtract_v(local_num_vertices_, stream_); + + rmm::device_uvector tmp_cluster_keys_v(0, stream_); + rmm::device_uvector tmp_cluster_weights_v(0, stream_); experimental::copy_v_transform_reduce_out_nbr( handle_, current_graph_view_, - src_cluster_cache_v_.begin(), - dst_cluster_cache_v_.begin(), + d_src_cluster_cache_, + d_dst_cluster_cache_, [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { if ((src != dst) && (src_cluster == nbr_cluster)) { return wt; @@ -671,549 +328,105 @@ class Louvain { weight_t{0}, old_cluster_sum_v.begin()); - cache_vertex_properties( - old_cluster_sum_v.begin(), src_old_cluster_sum_cache_v, empty_cache_weight_v_, true, false); - - detail::src_cluster_equality_comparator_t compare( - src_indices_v_.data().get(), - current_graph_view_.indices(), - dst_cluster_cache_v_.data().get(), - base_dst_vertex_id_, - std::numeric_limits::max()); - detail::src_cluster_hasher_t hasher(src_indices_v_.data().get(), - current_graph_view_.indices(), - dst_cluster_cache_v_.data().get(), - base_dst_vertex_id_); - detail::skip_edge_t skip_edge(src_indices_v_.data().get(), - current_graph_view_.indices()); - - // - // Group edges that lead from same source to same neighboring cluster together - // local_cluster_edge_ids_v will contain edge ids of unique pairs of (src,nbr_cluster). - // If multiple edges exist, one edge id will be chosen (by a parallel race). - // nbr_weights_v will contain the combined weight of all of the edges that connect - // that pair. - // - rmm::device_vector local_cluster_edge_ids_v; - rmm::device_vector nbr_weights_v; - - // - // Perform this combining on the local edges - // - std::tie(local_cluster_edge_ids_v, nbr_weights_v) = combine_local_src_nbr_cluster_weights( - hasher, compare, skip_edge, current_graph_view_.weights(), local_num_edges_); - - // - // In order to compute delta_Q for a given src/nbr_cluster pair, I need the following - // information: - // src - // old_cluster - the cluster that src is currently assigned to - // nbr_cluster - // sum of edges going to new cluster - // vertex weight of the src vertex - // sum of edges going to old cluster - // cluster_weights of old cluster - // cluster_weights of nbr_cluster - // - // Each GPU has locally cached: - // The sum of edges going to the old cluster (computed from - // experimental::copy_v_transform_reduce_out_nbr call above. 
- // old_cluster - // nbr_cluster - // vertex weight of src vertex - // partial sum of edges going to the new cluster (in nbr_weights) - // - // So the plan is to take the tuple: - // (src, old_cluster, src_vertex_weight, old_cluster_sum, nbr_cluster, nbr_weights) - // and shuffle it around the cluster so that they arrive at the GPU where the pair - // (old_cluster, new_cluster) would be assigned. Then we can aggregate this information - // and compute the delta_Q values. - // - - // - // Define the communication pattern, we're going to send detail - // for edge i to the GPU that is responsible for the vertex - // pair (cluster[src[i]], cluster[dst[i]]) - // - auto communication_schedule = thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - [d_edge_device_view = compute_partition_.edge_device_view(), - d_src_indices = src_indices_v_.data().get(), - d_src_cluster = src_cluster_cache_v_.data().get(), - d_dst_indices = current_graph_view_.indices(), - d_dst_cluster = dst_cluster_cache_v_.data().get(), - base_src_vertex_id = base_src_vertex_id_, - base_dst_vertex_id = base_dst_vertex_id_] __device__(edge_t edge_id) { - return d_edge_device_view(d_src_cluster[d_src_indices[edge_id] - base_src_vertex_id], - d_dst_cluster[d_dst_indices[edge_id] - base_dst_vertex_id]); - }); - - // FIXME: This should really be a variable_shuffle of a tuple, for time - // reasons I'm just doing 6 independent shuffles. - // - rmm::device_vector ocs_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id(src_old_cluster_sum_cache_v.data().get(), - src_indices_v_.data().get(), - base_src_vertex_id_)), - communication_schedule); - - rmm::device_vector src_cluster_v = - variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id( - src_cluster_cache_v_.data().get(), src_indices_v_.data().get(), base_src_vertex_id_)), - communication_schedule); - - rmm::device_vector src_vertex_weight_v = - variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id(src_vertex_weights_cache_v_.data().get(), - src_indices_v_.data().get(), - base_src_vertex_id_)), - communication_schedule); - - rmm::device_vector src_v = variable_shuffle( + experimental::copy_v_transform_reduce_out_nbr( handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(src_indices_v_.begin(), local_cluster_edge_ids_v.begin()), - communication_schedule); - - rmm::device_vector nbr_cluster_v = - variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_transform_iterator( - local_cluster_edge_ids_v.begin(), - detail::lookup_by_vertex_id( - dst_cluster_cache_v_.data().get(), current_graph_view_.indices(), base_dst_vertex_id_)), - communication_schedule); - - nbr_weights_v = variable_shuffle( - handle_, nbr_weights_v.size(), nbr_weights_v.begin(), communication_schedule); - - // - // At this point, src_v, nbr_cluster_v and nbr_weights_v have been - // shuffled to the correct GPU. We can now compute the final - // value of delta_Q for each neigboring cluster - // - // Again, we'll combine edges that connect the same source to the same - // neighboring cluster and sum their weights. 
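  //   (This hash-table / shuffle pipeline is what the refactor removes: the
  //   per-(src, neighbor-cluster) aggregation is instead performed by a single
  //   copy_v_transform_reduce_key_aggregated_out_nbr call in the new
  //   update_by_delta_modularity code further down in this diff.)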
- // - detail::src_dst_equality_comparator_t compare2( - src_v, nbr_cluster_v, std::numeric_limits::max()); - detail::src_dst_hasher_t hasher2(src_v, nbr_cluster_v); - - auto skip_edge2 = [] __device__(auto) { return false; }; - - std::tie(local_cluster_edge_ids_v, nbr_weights_v) = combine_local_src_nbr_cluster_weights( - hasher2, compare2, skip_edge2, nbr_weights_v.data().get(), src_v.size()); - - // - // Now local_cluster_edge_ids_v contains the edge ids of the src id/dest - // cluster id pairs, and nbr_weights_v contains the weight of edges - // going to that cluster id - // - // Now we can compute (locally) each delta_Q value - // - auto iter = thrust::make_zip_iterator( - thrust::make_tuple(local_cluster_edge_ids_v.begin(), nbr_weights_v.begin())); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - iter, - iter + local_cluster_edge_ids_v.size(), - nbr_weights_v.begin(), - [total_edge_weight, - resolution, - d_src = src_v.data().get(), - d_src_cluster = src_cluster_v.data().get(), - d_nbr_cluster = nbr_cluster_v.data().get(), - d_src_vertex_weights = src_vertex_weight_v.data().get(), - d_src_cluster_weights = src_cluster_weights_cache_v_.data().get(), - d_dst_cluster_weights = dst_cluster_weights_cache_v_.data().get(), - d_ocs = ocs_v.data().get(), - base_src_vertex_id = base_src_vertex_id_, - base_dst_vertex_id = base_dst_vertex_id_] __device__(auto tuple) { - edge_t edge_id = thrust::get<0>(tuple); - vertex_t nbr_cluster = d_nbr_cluster[edge_id]; - weight_t new_cluster_sum = thrust::get<1>(tuple); - vertex_t old_cluster = d_src_cluster[edge_id]; - weight_t k_k = d_src_vertex_weights[edge_id]; - weight_t old_cluster_sum = d_ocs[edge_id]; - - weight_t a_old = d_src_cluster_weights[old_cluster - base_src_vertex_id]; - weight_t a_new = d_dst_cluster_weights[nbr_cluster - base_dst_vertex_id]; - - return 2 * (((new_cluster_sum - old_cluster_sum) / total_edge_weight) - - resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / - (total_edge_weight * total_edge_weight)); - }); - - // - // Pick the largest delta_Q value for each vertex on this gpu. 
- // Then we will shuffle back to the gpu by vertex id - // - rmm::device_vector final_src_v(local_cluster_edge_ids_v.size()); - rmm::device_vector final_nbr_cluster_v(local_cluster_edge_ids_v.size()); - rmm::device_vector final_nbr_weights_v(local_cluster_edge_ids_v.size()); - - auto final_input_iter = thrust::make_zip_iterator(thrust::make_tuple( - thrust::make_permutation_iterator(src_v.begin(), local_cluster_edge_ids_v.begin()), - thrust::make_permutation_iterator(nbr_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - nbr_weights_v.begin())); - - auto final_output_iter = thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.begin(), final_nbr_cluster_v.begin(), final_nbr_weights_v.begin())); - - auto final_output_pos = - thrust::copy_if(rmm::exec_policy(stream_)->on(stream_), - final_input_iter, - final_input_iter + local_cluster_edge_ids_v.size(), - final_output_iter, - [] __device__(auto p) { return (thrust::get<2>(p) > weight_t{0}); }); - - final_src_v.resize(thrust::distance(final_output_iter, final_output_pos)); - final_nbr_cluster_v.resize(thrust::distance(final_output_iter, final_output_pos)); - final_nbr_weights_v.resize(thrust::distance(final_output_iter, final_output_pos)); - - // - // Sort the results, pick the largest version - // - thrust::sort(rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.begin(), final_nbr_weights_v.begin(), final_nbr_cluster_v.begin())), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.end(), final_nbr_weights_v.end(), final_nbr_cluster_v.begin())), - [] __device__(auto left, auto right) { - if (thrust::get<0>(left) < thrust::get<0>(right)) return true; - if (thrust::get<0>(left) > thrust::get<0>(right)) return false; - if (thrust::get<1>(left) > thrust::get<1>(right)) return true; - if (thrust::get<1>(left) < thrust::get<1>(right)) return false; - return (thrust::get<2>(left) < thrust::get<2>(right)); - }); - - // - // Now that we're sorted the first entry for each src value is the largest. - // - local_cluster_edge_ids_v.resize(final_src_v.size()); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(final_src_v.size()), - local_cluster_edge_ids_v.begin(), - [sentinel = std::numeric_limits::max(), - d_src = final_src_v.data().get()] __device__(edge_t edge_id) { - if (edge_id == 0) { return edge_id; } - - if (d_src[edge_id - 1] != d_src[edge_id]) { return edge_id; } - - return sentinel; - }); - - local_cluster_edge_ids_v = detail::remove_elements_from_vector( - local_cluster_edge_ids_v, - [sentinel = std::numeric_limits::max()] __device__(auto edge_id) { - return (edge_id != sentinel); + current_graph_view_, + d_src_cluster_cache_, + d_dst_cluster_cache_, + [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { + return (src == dst) ? 
wt : weight_t{0}; }, - stream_); - - final_nbr_cluster_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - final_nbr_weights_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_nbr_weights_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - final_src_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_src_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - // - // At this point... - // final_src_v contains the source indices - // final_nbr_cluster_v contains the neighboring clusters - // final_nbr_weights_v contains delta_Q for moving src to the neighboring - // - // They have been shuffled to the gpus responsible for their source vertex - // - // FIXME: Think about how this should work. - // I think Leiden is broken. I don't think that the code we have - // actually does anything. For now I'm going to ignore Leiden in - // MNMG, we can reconsider this later. - // - // If we ignore Leiden, I'd like to think about whether the reduction - // should occur now... - // - - // - // Sort the results, pick the largest version - // - thrust::sort(rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.begin(), final_nbr_weights_v.begin(), final_nbr_cluster_v.begin())), - thrust::make_zip_iterator(thrust::make_tuple( - final_src_v.end(), final_nbr_weights_v.end(), final_nbr_cluster_v.begin())), - [] __device__(auto left, auto right) { - if (thrust::get<0>(left) < thrust::get<0>(right)) return true; - if (thrust::get<0>(left) > thrust::get<0>(right)) return false; - if (thrust::get<1>(left) > thrust::get<1>(right)) return true; - if (thrust::get<1>(left) < thrust::get<1>(right)) return false; - return (thrust::get<2>(left) < thrust::get<2>(right)); - }); - - // - // Now that we're sorted (ascending), the last entry for each src value is the largest. 
- // - local_cluster_edge_ids_v.resize(final_src_v.size()); - - thrust::transform(rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(final_src_v.size()), - local_cluster_edge_ids_v.begin(), - [sentinel = std::numeric_limits::max(), - d_src = final_src_v.data().get()] __device__(edge_t edge_id) { - if (edge_id == 0) { return edge_id; } - - if (d_src[edge_id - 1] != d_src[edge_id]) { return edge_id; } + weight_t{0}, + cluster_subtract_v.begin()); - return sentinel; - }); + auto output_buffer = + cugraph::experimental::allocate_dataframe_buffer>( + local_num_vertices_, stream_); - local_cluster_edge_ids_v = detail::remove_elements_from_vector( - local_cluster_edge_ids_v, - [sentinel = std::numeric_limits::max()] __device__(auto edge_id) { - return (edge_id != sentinel); + copy_v_transform_reduce_key_aggregated_out_nbr( + handle_, + current_graph_view_, + thrust::make_zip_iterator(thrust::make_tuple(old_cluster_sum_v.begin(), + d_src_vertex_weights_cache_, + cluster_subtract_v.begin(), + d_src_cluster_cache_)), + + d_dst_cluster_cache_, + thrust::make_counting_iterator(base_vertex_id_), + thrust::make_counting_iterator(base_vertex_id_ + local_num_vertices_), + d_dst_cluster_weights_cache_, + [base_vertex_id = base_vertex_id_, + d_src_cluster_weights = d_src_cluster_weights_cache_, + total_edge_weight, + resolution] __device__(auto src, + auto neighbor_cluster, + auto new_cluster_sum, + auto src_info, + auto a_new) { + auto old_cluster_sum = thrust::get<0>(src_info); + auto k_k = thrust::get<1>(src_info); + auto cluster_subtract = thrust::get<2>(src_info); + auto src_cluster = thrust::get<3>(src_info); + auto a_old = d_src_cluster_weights[src_cluster]; + + if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; + + weight_t delta_modularity = 2 * (((new_cluster_sum - old_cluster_sum) / total_edge_weight) - + resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / + (total_edge_weight * total_edge_weight)); + + return thrust::make_tuple(neighbor_cluster, delta_modularity); }, - stream_); - - rmm::device_vector cluster_increase_v(final_src_v.size()); - rmm::device_vector cluster_decrease_v(final_src_v.size()); - rmm::device_vector old_cluster_v(final_src_v.size()); - - // - // Then we can, on each gpu, do a local assignment for all of the - // vertices assigned to that gpu using the up_down logic - // - local_cluster_edge_ids_v = detail::remove_elements_from_vector( - local_cluster_edge_ids_v, - local_cluster_edge_ids_v.begin(), - local_cluster_edge_ids_v.end(), - [d_final_src = final_src_v.data().get(), - d_final_nbr_cluster = final_nbr_cluster_v.data().get(), - d_final_nbr_weights = final_nbr_weights_v.data().get(), - d_cluster_increase = cluster_increase_v.data().get(), - d_cluster_decrease = cluster_decrease_v.data().get(), - d_vertex_weights = src_vertex_weights_cache_v_.data().get(), - d_next_cluster = next_cluster_v.data().get(), - d_old_cluster = old_cluster_v.data().get(), - base_vertex_id = base_vertex_id_, - base_src_vertex_id = base_src_vertex_id_, - up_down] __device__(edge_t idx) { - vertex_t src = d_final_src[idx]; - vertex_t new_cluster = d_final_nbr_cluster[idx]; - vertex_t old_cluster = d_next_cluster[src - base_vertex_id]; - weight_t src_weight = d_vertex_weights[src - base_src_vertex_id]; - - if (d_final_nbr_weights[idx] <= weight_t{0}) return false; - if (new_cluster == old_cluster) return false; - if ((new_cluster > old_cluster) != up_down) return false; - - d_next_cluster[src - base_vertex_id] = 
new_cluster; - d_cluster_increase[idx] = src_weight; - d_cluster_decrease[idx] = src_weight; - d_old_cluster[idx] = old_cluster; - return true; + [] __device__(auto p1, auto p2) { + return (thrust::get<1>(p1) < thrust::get<1>(p2)) ? p2 : p1; }, - stream_); + thrust::make_tuple(vertex_t{-1}, weight_t{0}), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer)); - cluster_increase_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(cluster_increase_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - final_nbr_cluster_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(final_nbr_cluster_v.begin(), - local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - cluster_decrease_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(cluster_decrease_v.begin(), - local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(old_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - old_cluster_v = variable_shuffle( - handle_, - local_cluster_edge_ids_v.size(), - thrust::make_permutation_iterator(old_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - thrust::make_transform_iterator( - thrust::make_permutation_iterator(old_cluster_v.begin(), local_cluster_edge_ids_v.begin()), - [d_vertex_device_view = compute_partition_.vertex_device_view()] __device__(vertex_t v) { - return d_vertex_device_view(v); - })); - - thrust::for_each(rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator( - thrust::make_tuple(final_nbr_cluster_v.begin(), cluster_increase_v.begin())), - thrust::make_zip_iterator( - thrust::make_tuple(final_nbr_cluster_v.end(), cluster_increase_v.end())), - [d_cluster_weights = cluster_weights_v_.data().get(), - base_vertex_id = base_vertex_id_] __device__(auto p) { - vertex_t cluster_id = thrust::get<0>(p); - weight_t weight = thrust::get<1>(p); - - atomicAdd(d_cluster_weights + cluster_id - base_vertex_id, weight); - }); - - thrust::for_each( + thrust::transform( rmm::exec_policy(stream_)->on(stream_), - thrust::make_zip_iterator( - thrust::make_tuple(old_cluster_v.begin(), cluster_decrease_v.begin())), - thrust::make_zip_iterator(thrust::make_tuple(old_cluster_v.end(), cluster_decrease_v.end())), - [d_cluster_weights = cluster_weights_v_.data().get(), - base_vertex_id = base_vertex_id_] __device__(auto p) { - vertex_t cluster_id = thrust::get<0>(p); - weight_t weight = thrust::get<1>(p); - - atomicAdd(d_cluster_weights + cluster_id - base_vertex_id, -weight); + next_cluster_v.begin(), + next_cluster_v.end(), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer), + next_cluster_v.begin(), + [up_down] __device__(vertex_t old_cluster, auto p) { + vertex_t new_cluster = thrust::get<0>(p); + weight_t 
delta_modularity = thrust::get<1>(p); + + return (delta_modularity > weight_t{0}) + ? (((new_cluster > old_cluster) != up_down) ? old_cluster : new_cluster) + : old_cluster; }); - cache_vertex_properties( - cluster_weights_v_.begin(), src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); - } - - template - std::pair, rmm::device_vector> - combine_local_src_nbr_cluster_weights(hash_t hasher, - compare_t compare, - skip_edge_t skip_edge, - weight_t const *d_weights, - count_t num_weights) - { - rmm::device_vector relevant_edges_v; - rmm::device_vector relevant_edge_weights_v; - - if (num_weights > 0) { - std::size_t capacity{static_cast(num_weights / 0.7)}; - - cuco::static_map hash_map( - capacity, std::numeric_limits::max(), count_t{0}); - detail::create_cuco_pair_t create_cuco_pair; - - CUDA_TRY(cudaStreamSynchronize(stream_)); - - hash_map.insert(thrust::make_transform_iterator(thrust::make_counting_iterator(0), - create_cuco_pair), - thrust::make_transform_iterator( - thrust::make_counting_iterator(num_weights), create_cuco_pair), - hasher, - compare); - - CUDA_TRY(cudaStreamSynchronize(stream_)); - - relevant_edges_v.resize(num_weights); - - relevant_edges_v = detail::remove_elements_from_vector( - relevant_edges_v, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_weights), - [d_hash_map = hash_map.get_device_view(), hasher, compare] __device__(count_t idx) { - auto pos = d_hash_map.find(idx, hasher, compare); - return (pos->first == idx); - }, - stream_); - - thrust::for_each_n( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - relevant_edges_v.size(), - [d_hash_map = hash_map.get_device_view(), - hasher, - compare, - d_relevant_edges = relevant_edges_v.data().get()] __device__(count_t idx) mutable { - count_t edge_id = d_relevant_edges[idx]; - auto pos = d_hash_map.find(edge_id, hasher, compare); - pos->second.store(idx); - }); - - relevant_edge_weights_v.resize(relevant_edges_v.size()); - thrust::fill(rmm::exec_policy(stream_)->on(stream_), - relevant_edge_weights_v.begin(), - relevant_edge_weights_v.end(), - weight_t{0}); - - thrust::for_each_n( - rmm::exec_policy(stream_)->on(stream_), - thrust::make_counting_iterator(0), - num_weights, - [d_hash_map = hash_map.get_device_view(), - hasher, - compare, - skip_edge, - d_relevant_edge_weights = relevant_edge_weights_v.data().get(), - d_weights] __device__(count_t idx) { - if (!skip_edge(idx)) { - auto pos = d_hash_map.find(idx, hasher, compare); - if (pos != d_hash_map.end()) { - atomicAdd(d_relevant_edge_weights + pos->second.load(cuda::std::memory_order_relaxed), - d_weights[idx]); - } - } - }); - } + std::tie(d_src_cluster_cache_, d_dst_cluster_cache_) = + cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); - return std::make_pair(relevant_edges_v, relevant_edge_weights_v); + std::tie(tmp_cluster_keys_v, tmp_cluster_weights_v) = + cugraph::experimental::transform_reduce_by_adj_matrix_row_key_e( + handle_, + current_graph_view_, + thrust::make_constant_iterator(0), + thrust::make_constant_iterator(0), + d_src_cluster_cache_, + [] __device__(auto src, auto, auto wt, auto, auto) { return wt; }, + weight_t{0}); + + thrust::fill(rmm::exec_policy(stream_)->on(stream_), + cluster_weights_v_.begin(), + cluster_weights_v_.end(), + weight_t{0}); + + thrust::scatter(rmm::exec_policy(stream_)->on(stream_), + tmp_cluster_weights_v.begin(), + tmp_cluster_weights_v.end(), + tmp_cluster_keys_v.begin(), + cluster_weights_v_.begin()); + + 
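    // The score maximized above is
    //   delta_Q = 2 * ( (sum_new - sum_old) / W
    //                   - resolution * k_v * (a_new - a_old + k_v) / (W * W) ),
    // with W = total_edge_weight, k_v the vertex's weighted degree, and
    // a_old / a_new the weights of the current and candidate clusters.
    // A vertex moves only when its best delta_Q is positive, and the
    // (new_cluster > old_cluster) != up_down test restricts each pass to moves
    // in one direction, which helps keep adjacent vertices from simply swapping
    // clusters on successive passes.  The transform_reduce_by_adj_matrix_row_key_e
    // / scatter sequence above then rebuilds cluster_weights_v_ (the a_c terms)
    // from the new assignment before the cluster-weight caches are refreshed below.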
std::tie(d_src_cluster_weights_cache_, d_dst_cluster_weights_cache_) = cache_vertex_properties( + cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); } -#endif void shrink_graph() { @@ -1227,26 +440,8 @@ class Louvain { current_graph_view_ = current_graph_->view(); local_num_vertices_ = current_graph_view_.get_number_of_local_vertices(); - local_num_rows_ = current_graph_view_.get_number_of_local_adj_matrix_partition_rows(); - local_num_cols_ = current_graph_view_.get_number_of_local_adj_matrix_partition_cols(); base_vertex_id_ = current_graph_view_.get_local_vertex_first(); - local_num_edges_ = thrust::transform_reduce( - thrust::host, - thrust::make_counting_iterator(0), - thrust::make_counting_iterator( - current_graph_view_.get_number_of_local_adj_matrix_partitions()), - [this](auto indx) { - return current_graph_view_.get_number_of_local_adj_matrix_partition_edges(indx); - }, - size_t{0}, - thrust::plus()); - - src_indices_v_.resize(local_num_edges_); - - cugraph::detail::offsets_to_indices( - current_graph_view_.offsets(), local_num_rows_, src_indices_v_.data().get()); - rmm::device_uvector numbering_indices(numbering_map.size(), stream_); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), numbering_indices.begin(), @@ -1270,45 +465,29 @@ class Louvain { std::unique_ptr> dendrogram_; - vertex_t number_of_vertices_; + vertex_t local_num_vertices_; vertex_t base_vertex_id_{0}; - vertex_t base_src_vertex_id_{0}; - vertex_t base_dst_vertex_id_{0}; int rank_{0}; - vertex_t local_num_vertices_; - vertex_t local_num_rows_; - vertex_t local_num_cols_; - edge_t local_num_edges_; - // // Copy of graph // std::unique_ptr current_graph_{}; graph_view_t current_graph_view_; - // - // For partitioning - // - detail::compute_partition_t compute_partition_; - - rmm::device_vector src_indices_v_; - - // - // Weights and clustering across iterations of algorithm - // - rmm::device_vector vertex_weights_v_; - rmm::device_vector src_vertex_weights_cache_v_{}; - rmm::device_vector dst_vertex_weights_cache_v_{}; - - rmm::device_vector cluster_weights_v_; - rmm::device_vector src_cluster_weights_cache_v_{}; - rmm::device_vector dst_cluster_weights_cache_v_{}; - - rmm::device_vector src_cluster_cache_v_{}; - rmm::device_vector dst_cluster_cache_v_{}; - - rmm::device_vector empty_cache_weight_v_{}; + rmm::device_uvector vertex_weights_v_; + rmm::device_uvector cluster_weights_v_; + rmm::device_uvector src_vertex_weights_cache_v_; + rmm::device_uvector src_cluster_weights_cache_v_; + rmm::device_uvector dst_cluster_weights_cache_v_; + rmm::device_uvector src_cluster_cache_v_; + rmm::device_uvector dst_cluster_cache_v_; + + weight_t *d_src_vertex_weights_cache_; + weight_t *d_src_cluster_weights_cache_; + weight_t *d_dst_cluster_weights_cache_; + vertex_t *d_src_cluster_cache_; + vertex_t *d_dst_cluster_cache_; #ifdef TIMING HighResTimer hr_timer_; From e1581d7a56f0319738c9037dedfa613299c4ffd3 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Mon, 22 Feb 2021 20:08:40 -0500 Subject: [PATCH 02/18] clean up include files, delete obsolete header --- cpp/src/experimental/louvain.cuh | 27 ++-- cpp/src/experimental/shuffle.cuh | 226 ------------------------------- 2 files changed, 8 insertions(+), 245 deletions(-) delete mode 100644 cpp/src/experimental/shuffle.cuh diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 3597923d6b9..5e70d32ac94 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -15,18 +15,11 @@ 
*/ #pragma once -#include +#include #include - -#include -#include -#include -#include - -#include - #include + #include #include #include @@ -34,10 +27,6 @@ #include #include -#include - -#include - //#define TIMING #ifdef TIMING @@ -222,8 +211,7 @@ class Louvain { } template - T *cache_src_vertex_properties(rmm::device_uvector &input, - rmm::device_uvector &src_cache_v) + T *cache_src_vertex_properties(rmm::device_uvector &input, rmm::device_uvector &src_cache_v) { if (graph_view_t::is_multi_gpu) { src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), @@ -236,8 +224,7 @@ class Louvain { } template - T *cache_dst_vertex_properties(rmm::device_uvector &input, - rmm::device_uvector &dst_cache_v) + T *cache_dst_vertex_properties(rmm::device_uvector &input, rmm::device_uvector &dst_cache_v) { if (graph_view_t::is_multi_gpu) { dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols(), @@ -470,7 +457,9 @@ class Louvain { int rank_{0}; // - // Copy of graph + // Initially we run on the input graph view, + // but as we shrink the graph we'll keep the + // current graph here // std::unique_ptr current_graph_{}; graph_view_t current_graph_view_; @@ -492,7 +481,7 @@ class Louvain { #ifdef TIMING HighResTimer hr_timer_; #endif -}; // namespace experimental +}; } // namespace experimental } // namespace cugraph diff --git a/cpp/src/experimental/shuffle.cuh b/cpp/src/experimental/shuffle.cuh deleted file mode 100644 index 40f3b510b10..00000000000 --- a/cpp/src/experimental/shuffle.cuh +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2020, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -namespace cugraph { -namespace experimental { - -namespace detail { - -// -// FIXME: This implementation of variable_shuffle stages the data for transfer -// in host memory. It would be more efficient, I believe, to stage the -// data in device memory, but it would require actually instantiating -// the data in device memory which is already precious in the Louvain -// implementation. We should explore if it's actually more efficient -// through device memory and whether the improvement is worth the extra -// memory required. 
-// -template -rmm::device_vector variable_shuffle(raft::handle_t const &handle, - std::size_t n_elements, - iterator_t data_iter, - partition_iter_t partition_iter) -{ - // - // We need to compute the size of data movement - // - raft::comms::comms_t const &comms = handle.get_comms(); - - cudaStream_t stream = handle.get_stream(); - int num_gpus = comms.get_size(); - int my_gpu = comms.get_rank(); - - rmm::device_vector local_sizes_v(num_gpus, size_t{0}); - - thrust::for_each(rmm::exec_policy(stream)->on(stream), - partition_iter, - partition_iter + n_elements, - [num_gpus, d_local_sizes = local_sizes_v.data().get()] __device__(auto p) { - atomicAdd(d_local_sizes + p, size_t{1}); - }); - - std::vector h_local_sizes_v(num_gpus); - std::vector h_global_sizes_v(num_gpus); - std::vector h_input_v(n_elements); - std::vector h_partitions_v(n_elements); - - thrust::copy(local_sizes_v.begin(), local_sizes_v.end(), h_local_sizes_v.begin()); - thrust::copy(partition_iter, partition_iter + n_elements, h_partitions_v.begin()); - - std::vector requests(2 * num_gpus); - - int request_pos = 0; - - for (int gpu = 0; gpu < num_gpus; ++gpu) { - if (gpu != my_gpu) { - comms.irecv(&h_global_sizes_v[gpu], 1, gpu, 0, &requests[request_pos]); - ++request_pos; - comms.isend(&h_local_sizes_v[gpu], 1, gpu, 0, &requests[request_pos]); - ++request_pos; - } else { - h_global_sizes_v[gpu] = h_local_sizes_v[gpu]; - } - } - - if (request_pos > 0) { comms.waitall(request_pos, requests.data()); } - - comms.barrier(); - - // - // Now global_sizes contains all of the counts, we need to - // allocate an array of the appropriate size - // - int64_t receive_size = - thrust::reduce(thrust::host, h_global_sizes_v.begin(), h_global_sizes_v.end()); - - std::vector temp_data; - - if (receive_size > 0) temp_data.resize(receive_size); - - rmm::device_vector input_v(n_elements); - - auto input_start = input_v.begin(); - - for (int gpu = 0; gpu < num_gpus; ++gpu) { - input_start = thrust::copy_if(rmm::exec_policy(stream)->on(stream), - data_iter, - data_iter + n_elements, - partition_iter, - input_start, - [gpu] __device__(int32_t p) { return p == gpu; }); - } - - thrust::copy(input_v.begin(), input_v.end(), h_input_v.begin()); - - std::vector temp_v(num_gpus + 1); - - thrust::exclusive_scan( - thrust::host, h_global_sizes_v.begin(), h_global_sizes_v.end(), temp_v.begin()); - - temp_v[num_gpus] = temp_v[num_gpus - 1] + h_global_sizes_v[num_gpus - 1]; - h_global_sizes_v = temp_v; - - thrust::exclusive_scan( - thrust::host, h_local_sizes_v.begin(), h_local_sizes_v.end(), temp_v.begin()); - - temp_v[num_gpus] = temp_v[num_gpus - 1] + h_local_sizes_v[num_gpus - 1]; - h_local_sizes_v = temp_v; - - CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - comms.barrier(); - - request_pos = 0; - - for (int gpu = 0; gpu < num_gpus; ++gpu) { - size_t to_receive = h_global_sizes_v[gpu + 1] - h_global_sizes_v[gpu]; - size_t to_send = h_local_sizes_v[gpu + 1] - h_local_sizes_v[gpu]; - - if (gpu != my_gpu) { - if (to_receive > 0) { - comms.irecv( - temp_data.data() + h_global_sizes_v[gpu], to_receive, gpu, 0, &requests[request_pos]); - ++request_pos; - } - - if (to_send > 0) { - comms.isend( - h_input_v.data() + h_local_sizes_v[gpu], to_send, gpu, 0, &requests[request_pos]); - ++request_pos; - } - } else if (to_receive > 0) { - std::copy(h_input_v.begin() + h_local_sizes_v[gpu], - h_input_v.begin() + h_local_sizes_v[gpu + 1], - temp_data.begin() + h_global_sizes_v[gpu]); - } - } - - comms.barrier(); - - if (request_pos > 0) { 
comms.waitall(request_pos, requests.data()); } - - comms.barrier(); - - return rmm::device_vector(temp_data); -} - -} // namespace detail - -/** - * @brief shuffle data to the desired partition - * - * MNMG algorithms require shuffling data between partitions - * to get the data to the right location for computation. - * - * This function operates dynamically, there is no - * a priori knowledge about where the data will need - * to be transferred. - * - * This function will be executed on each GPU. Each gpu - * has a portion of the data (specified by begin_data and - * end_data iterators) and an iterator that identifies - * (for each corresponding element) which GPU the data - * should be shuffled to. - * - * The return value will be a device vector containing - * the data received by this GPU. - * - * Note that this function accepts iterators as input. - * `partition_iterator` will be traversed multiple times. - * - * @tparam is_multi_gpu If true, multi-gpu - shuffle will occur - * If false, single GPU - simple copy will occur - * @tparam data_t Type of the data being shuffled - * @tparam iterator_t Iterator referencing data to be shuffled - * @tparam partition_iter_t Iterator identifying the destination partition - * - * @param handle Library handle (RAFT) - * @param n_elements Number of elements to transfer - * @param data_iter Iterator that returns the elements to be transfered - * @param partition_iter Iterator that returns the partition where elements - * should be transfered. - */ -template * = nullptr> -rmm::device_vector variable_shuffle(raft::handle_t const &handle, - std::size_t n_elements, - iterator_t data_iter, - partition_iter_t partition_iter) -{ - return detail::variable_shuffle(handle, n_elements, data_iter, partition_iter); -} - -template * = nullptr> -rmm::device_vector variable_shuffle(raft::handle_t const &handle, - std::size_t n_elements, - iterator_t data_iter, - partition_iter_t partition_iter) -{ - return rmm::device_vector(data_iter, data_iter + n_elements); -} - -} // namespace experimental -} // namespace cugraph From 11aeffa34b2760b4837fcec5da851502ff7782f7 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Feb 2021 09:44:18 -0500 Subject: [PATCH 03/18] fix weight use in copy_v_transform_reduce_key_aggregated_out_nbr --- ...opy_v_transform_reduce_key_aggregated_out_nbr.cuh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 8490df1d17d..35938bad7cb 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -88,16 +88,17 @@ __global__ void for_all_major_for_all_nbr_low_degree( // in-place reduce_by_key vertex_t key_idx{0}; key_aggregated_edge_weights[local_offset + key_idx] = - weights != nullptr ? weights[0] : weight_t{1.0}; + weights != nullptr ? key_aggregated_edge_weights[local_offset] : weight_t{1.0}; + for (edge_t i = 1; i < local_degree; ++i) { if (minor_keys[local_offset + i] == minor_keys[local_offset + key_idx]) { key_aggregated_edge_weights[local_offset + key_idx] += - weights != nullptr ? weights[i] : weight_t{1.0}; + weights != nullptr ? 
key_aggregated_edge_weights[local_offset + i] : weight_t{1.0}; } else { ++key_idx; minor_keys[local_offset + key_idx] = minor_keys[local_offset + i]; key_aggregated_edge_weights[local_offset + key_idx] = - weights != nullptr ? weights[i] : weight_t{1.0}; + weights != nullptr ? key_aggregated_edge_weights[local_offset + i] : weight_t{1.0}; } } thrust::fill(thrust::seq, @@ -170,6 +171,7 @@ __global__ void for_all_major_for_all_nbr_low_degree( template Date: Fri, 26 Feb 2021 18:53:09 -0500 Subject: [PATCH 04/18] use compute_out_weight_sums, add MG unit test --- cpp/src/experimental/louvain.cuh | 18 +- cpp/tests/CMakeLists.txt | 12 ++ cpp/tests/community/louvain_mg_test.cpp | 213 +++++++++++++++++++++++ cpp/tests/utilities/mg_test_utilities.cu | 7 + 4 files changed, 237 insertions(+), 13 deletions(-) create mode 100644 cpp/tests/community/louvain_mg_test.cpp diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 5e70d32ac94..b0df1f2a757 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -183,18 +183,7 @@ class Louvain { { timer_start("compute_vertex_and_cluster_weights"); - // - // TODO: Once PR 1394 is merged, this can be replaced by: - // vertex_weights_v_ = current_graph_view_.compute_out_weight_sums(handle_); - // - experimental::copy_v_transform_reduce_out_nbr( - handle_, - current_graph_view_, - thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0), - [] __device__(auto src, auto, auto wt, auto, auto) { return wt; }, - weight_t{0}, - vertex_weights_v_.begin()); + vertex_weights_v_ = current_graph_view_.compute_out_weight_sums(handle_); thrust::copy(rmm::exec_policy(stream_)->on(stream_), vertex_weights_v_.begin(), @@ -301,6 +290,7 @@ class Louvain { rmm::device_uvector tmp_cluster_keys_v(0, stream_); rmm::device_uvector tmp_cluster_weights_v(0, stream_); + // TODO: These two calls could be combined and return a pair, I would think that would be faster experimental::copy_v_transform_reduce_out_nbr( handle_, current_graph_view_, @@ -408,7 +398,9 @@ class Louvain { thrust::scatter(rmm::exec_policy(stream_)->on(stream_), tmp_cluster_weights_v.begin(), tmp_cluster_weights_v.end(), - tmp_cluster_keys_v.begin(), + thrust::make_transform_iterator(tmp_cluster_keys_v.begin(), + [base_vertex_id = base_vertex_id_] __device__( + auto key) { return key - base_vertex_id; }), cluster_weights_v_.begin()); std::tie(d_src_cluster_weights_cache_, d_dst_cluster_weights_cache_) = cache_vertex_properties( diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 68b277871b1..daf43a9f87a 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -438,6 +438,18 @@ if(BUILD_CUGRAPH_MG_TESTS) ConfigureTest(MG_PAGERANK_TEST "${MG_PAGERANK_TEST_SRCS}") target_link_libraries(MG_PAGERANK_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + ########################################################################################### + # - MG LOUVAIN tests --------------------------------------------------------------------- + + set(MG_LOUVAIN_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c" + "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/utilities/mg_test_utilities.cu" + "${CMAKE_CURRENT_SOURCE_DIR}/community/louvain_mg_test.cpp") + + ConfigureTest(MG_LOUVAIN_TEST "${MG_LOUVAIN_TEST_SRCS}") + target_link_libraries(MG_LOUVAIN_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + else(MPI_CXX_FOUND) message(FATAL_ERROR "OpenMPI NOT found, cannot build MG tests.") 
endif(MPI_CXX_FOUND) diff --git a/cpp/tests/community/louvain_mg_test.cpp b/cpp/tests/community/louvain_mg_test.cpp new file mode 100644 index 00000000000..06715d52bc0 --- /dev/null +++ b/cpp/tests/community/louvain_mg_test.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +#include + +//////////////////////////////////////////////////////////////////////////////// +// Test param object. This defines the input and expected output for a test, and +// will be instantiated as the parameter to the tests defined below using +// INSTANTIATE_TEST_CASE_P() +// +struct Louvain_Testparams { + std::string graph_file_full_path{}; + bool weighted{false}; + size_t max_level; + double resolution; + + // TODO: We really should have a Graph_Testparms_Base class or something + // like that which can handle this graph_full_path thing. + // + Louvain_Testparams(std::string const& graph_file_path, + bool weighted, + size_t max_level, + double resolution) + : weighted(weighted), max_level(max_level), resolution(resolution) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path = graph_file_path; + } + }; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Parameterized test fixture, to be used with TEST_P(). This defines common +// setup and teardown steps as well as common utilities used by each E2E MG +// test. In this case, each test is identical except for the inputs and +// expected outputs, so the entire test is defined in the run_test() method. +// +class Louvain_MG_Testfixture : public cugraph::test::MG_TestFixture_t, + public ::testing::WithParamInterface { + public: + // Run once for each test instance + virtual void SetUp() {} + virtual void TearDown() {} + + // Return the results of running louvain on a single GPU for the dataset in + // graph_file_path. 
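+ //
+ // A minimal usage sketch (illustrative only; the concrete types are an
+ // assumption that mirrors the CheckInt32Int32FloatFloat instantiation below,
+ // and the arguments come from the test's Louvain_Testparams):
+ //
+ //   std::tie(sg_level, sg_modularity, sg_clustering) =
+ //     get_sg_results<int32_t, int32_t, float>(
+ //       handle, param.graph_file_full_path, param.max_level, param.resolution);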
+ template + std::tuple> get_sg_results( + raft::handle_t const& handle, + std::string const& graph_file_path, + size_t max_level, + weight_t resolution) + { + // TODO: Put this in the Graph test base class + // (make the call simpler here) + auto graph = + cugraph::test::read_graph_from_matrix_market_file( + handle, graph_file_path, true); // FIXME: should use param.test_weighted instead of true + + auto graph_view = graph.view(); + cudaStream_t stream = handle.get_stream(); + + rmm::device_uvector clustering_v(graph_view.get_number_of_local_vertices(), stream); + + size_t level; + weight_t modularity; + + std::tie(level, modularity) = + cugraph::louvain(handle, graph_view, clustering_v.data(), max_level, resolution); + + std::vector clustering(graph_view.get_number_of_local_vertices()); + raft::update_host(clustering.data(), clustering_v.data(), clustering_v.size(), stream); + + return std::make_tuple(level, modularity, clustering); + } + + // Compare the results of running louvain on multiple GPUs to that of a + // single-GPU run for the configuration in param. + template + void run_test(const Louvain_Testparams& param) + { + raft::handle_t handle; + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); + const auto& comm = handle.get_comms(); + + cudaStream_t stream = handle.get_stream(); + + // Assuming 2 GPUs which means 1 row, 2 cols. 2 cols = row_comm_size of 2. + // FIXME: DO NOT ASSUME 2 GPUs, add code to compute prows, pcols + size_t row_comm_size{2}; + cugraph::partition_2d::subcomm_factory_t + subcomm_factory(handle, row_comm_size); + + int my_rank = comm.get_rank(); + + // FIXME: graph must be weighted! + std::unique_ptr> // store_transposed=false, + // multi_gpu=true + mg_graph_ptr{}; + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); + + std::tie(mg_graph_ptr, d_renumber_map_labels) = cugraph::test:: + create_graph_for_gpu // store_transposed=true + (handle, param.graph_file_full_path); + + auto mg_graph_view = mg_graph_ptr->view(); + + rmm::device_uvector clustering_v(mg_graph_view.get_number_of_local_vertices(), + stream); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + int level; + weight_t modularity; + + std::cout << "calling MG louvain" << std::endl; + + std::tie(level, modularity) = cugraph::louvain( + handle, mg_graph_view, clustering_v.data(), param.max_level, param.resolution); + + std::vector clustering(mg_graph_view.get_number_of_local_vertices()); + + raft::update_host(clustering.data(), clustering_v.data(), clustering_v.size(), stream); + + std::vector h_renumber_map_labels(mg_graph_view.get_number_of_vertices()); + raft::update_host(h_renumber_map_labels.data(), + d_renumber_map_labels.data(), + d_renumber_map_labels.size(), + stream); + + // Compare MG to SG + + // Each GPU will have a subset of the clustering + int sg_level; + weight_t sg_modularity; + std::vector sg_clustering; + + std::tie(sg_level, sg_modularity, sg_clustering) = get_sg_results( + handle, param.graph_file_full_path, param.max_level, param.resolution); + + std::cout << "MG: level = " << level << ", modularity = " << modularity << std::endl; + raft::print_host_vector("clustering", clustering.data(), clustering.size(), std::cout); + + std::cout << "SG: level = " << sg_level << ", modularity = " << sg_modularity << std::endl; + raft::print_host_vector("clustering", sg_clustering.data(), sg_clustering.size(), std::cout); + +#if 0 + // For this test, each GPU will have the full set of vertices and + // therefore the pageranks vectors 
should be equal in size. + ASSERT_EQ(h_sg_pageranks.size(), h_mg_pageranks.size()); + + auto threshold_ratio = 1e-3; + auto threshold_magnitude = + (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * + threshold_ratio; // skip comparison for low PageRank verties (lowly ranked vertices) + auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { + return std::abs(lhs - rhs) < + std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); + }; + + vertex_t mapped_vertex{0}; + for (vertex_t i = 0; + i + mg_graph_view.get_local_vertex_first() < mg_graph_view.get_local_vertex_last(); + ++i) { + mapped_vertex = h_renumber_map_labels[i]; + ASSERT_TRUE(nearly_equal(h_mg_pageranks[i], h_sg_pageranks[mapped_vertex])) + << "MG PageRank value for vertex: " << i << " in rank: " << my_rank + << " has value: " << h_mg_pageranks[i] + << " which exceeds the error margin for comparing to SG value: " << h_sg_pageranks[i]; + } +#endif + } +}; + +//////////////////////////////////////////////////////////////////////////////// +TEST_P(Louvain_MG_Testfixture, CheckInt32Int32FloatFloat) +{ + run_test(GetParam()); +} + +INSTANTIATE_TEST_CASE_P( + e2e, + Louvain_MG_Testfixture, + ::testing::Values(Louvain_Testparams("test/datasets/karate.mtx", true, 100, 1) + // Louvain_Testparams("test/datasets/webbase-1M.mtx", true, 100, 1), + )); + +// FIXME: Enable proper RMM configuration by using CUGRAPH_TEST_PROGRAM_MAIN(). +// Currently seeing a RMM failure during init, need to investigate. +// CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/mg_test_utilities.cu b/cpp/tests/utilities/mg_test_utilities.cu index 26f2450b589..f3fe1480527 100644 --- a/cpp/tests/utilities/mg_test_utilities.cu +++ b/cpp/tests/utilities/mg_test_utilities.cu @@ -176,5 +176,12 @@ template std::tuple< rmm::device_uvector> create_graph_for_gpu(raft::handle_t& handle, const std::string& graph_file_path); +template std::tuple< + std::unique_ptr< + cugraph::experimental::graph_t>, // store_transposed=true + // multi_gpu=true + rmm::device_uvector> +create_graph_for_gpu(raft::handle_t& handle, const std::string& graph_file_path); + } // namespace test } // namespace cugraph From 9e4421e743f7ddbbf64a1eefcfa46e3ef4846c53 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Mon, 15 Mar 2021 19:13:24 -0400 Subject: [PATCH 05/18] MNMG Louvain working correctly --- cpp/include/utilities/device_comm.cuh | 12 +- cpp/src/community/dendrogram.cuh | 7 + cpp/src/community/ecg.cu | 2 +- cpp/src/community/flatten_dendrogram.cuh | 37 ++- cpp/src/community/leiden.cuh | 2 +- cpp/src/community/louvain.cuh | 2 +- cpp/src/experimental/louvain.cuh | 289 ++++++++++++------ cpp/tests/CMakeLists.txt | 3 - cpp/tests/community/louvain_mg_test.cpp | 148 ++++----- .../utilities/generate_graph_from_edgelist.cu | 210 +++++++++---- 10 files changed, 450 insertions(+), 262 deletions(-) diff --git a/cpp/include/utilities/device_comm.cuh b/cpp/include/utilities/device_comm.cuh index 7b9956902cc..e0bbea76068 100644 --- a/cpp/include/utilities/device_comm.cuh +++ b/cpp/include/utilities/device_comm.cuh @@ -238,10 +238,12 @@ template struct device_sendrecv_tuple_iterator_element_impl { void run(raft::comms::comms_t const& comm, InputIterator input_first, - size_t count, + size_t tx_count, int dst, - int base_tag, - raft::comms::request_t* requests) const + OutputIterator output_first, + size_t rx_count, + int src, + cudaStream_t stream) const { } }; @@ -460,7 +462,7 @@ struct device_reduce_tuple_iterator_element_impl { op, root, stream); - 
device_reduce_tuple_iterator_element_impl( + device_reduce_tuple_iterator_element_impl().run( comm, input_first, output_first, count, op, root, stream); } }; @@ -890,7 +892,7 @@ device_reduce(raft::comms::comms_t const& comm, thrust::tuple_size::value_type>::value; detail:: - device_reduce_tuple_iterator_element_impl( + device_reduce_tuple_iterator_element_impl().run( comm, input_first, output_first, count, op, root, stream); } diff --git a/cpp/src/community/dendrogram.cuh b/cpp/src/community/dendrogram.cuh index 414f5f3854d..eed34064d9a 100644 --- a/cpp/src/community/dendrogram.cuh +++ b/cpp/src/community/dendrogram.cuh @@ -26,12 +26,14 @@ template class Dendrogram { public: void add_level(vertex_t num_verts, + vertex_t first_index, cudaStream_t stream = 0, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { level_ptr_.push_back( std::make_unique(num_verts * sizeof(vertex_t), stream, mr)); level_size_.push_back(num_verts); + level_first_index_.push_back(first_index); } size_t current_level() const { return level_size_.size() - 1; } @@ -50,6 +52,8 @@ class Dendrogram { vertex_t get_level_size_nocheck(size_t level) const { return level_size_[level]; } + vertex_t get_level_first_index_nocheck(size_t level) const { return level_first_index_[level]; } + vertex_t const *current_level_begin() const { return get_level_ptr_nocheck(current_level()); } vertex_t const *current_level_end() const { return current_level_begin() + current_level_size(); } @@ -60,8 +64,11 @@ class Dendrogram { vertex_t current_level_size() const { return get_level_size_nocheck(current_level()); } + vertex_t current_level_first_index() const { return get_level_first_index_nocheck(current_level()); } + private: std::vector level_size_; + std::vector level_first_index_; std::vector> level_ptr_; }; diff --git a/cpp/src/community/ecg.cu b/cpp/src/community/ecg.cu index 994204ecd32..1563d9393d8 100644 --- a/cpp/src/community/ecg.cu +++ b/cpp/src/community/ecg.cu @@ -117,7 +117,7 @@ class EcgLouvain : public cugraph::Louvain { void initialize_dendrogram_level(vertex_t num_vertices) override { - this->dendrogram_->add_level(num_vertices); + this->dendrogram_->add_level(num_vertices, 0); get_permutation_vector( num_vertices, seed_, this->dendrogram_->current_level_begin(), this->stream_); diff --git a/cpp/src/community/flatten_dendrogram.cuh b/cpp/src/community/flatten_dendrogram.cuh index 892fe2d1c51..6961d7c9b87 100644 --- a/cpp/src/community/flatten_dendrogram.cuh +++ b/cpp/src/community/flatten_dendrogram.cuh @@ -31,23 +31,28 @@ void partition_at_level(raft::handle_t const &handle, size_t level) { vertex_t local_num_verts = dendrogram.get_level_size_nocheck(0); + rmm::device_uvector local_vertex_ids_v(local_num_verts, handle.get_stream()); - thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - d_vertex_ids, - d_vertex_ids + local_num_verts, - d_partition); - - std::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(level), - [&handle, &dendrogram, d_vertex_ids, &d_partition, local_num_verts](size_t l) { - cugraph::experimental::relabel( - handle, - std::tuple( - d_vertex_ids, dendrogram.get_level_ptr_nocheck(l)), - dendrogram.get_level_size_nocheck(l), - d_partition, - local_num_verts); - }); + raft::copy(d_partition, d_vertex_ids, local_num_verts, handle.get_stream()); + + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(level), + [&handle, &dendrogram, &local_vertex_ids_v, d_vertex_ids, &d_partition, 
local_num_verts]( + size_t l) { + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + local_vertex_ids_v.begin(), + local_vertex_ids_v.begin() + dendrogram.get_level_size_nocheck(l), + dendrogram.get_level_first_index_nocheck(l)); + + cugraph::experimental::relabel( + handle, + std::tuple(local_vertex_ids_v.data(), + dendrogram.get_level_ptr_nocheck(l)), + dendrogram.get_level_size_nocheck(l), + d_partition, + local_num_verts); + }); } } // namespace cugraph diff --git a/cpp/src/community/leiden.cuh b/cpp/src/community/leiden.cuh index 141f8beac40..3c3c7815aa8 100644 --- a/cpp/src/community/leiden.cuh +++ b/cpp/src/community/leiden.cuh @@ -132,7 +132,7 @@ class Leiden : public Louvain { // // Initialize every cluster to reference each vertex to itself // - this->dendrogram_->add_level(current_graph.number_of_vertices); + this->dendrogram_->add_level(current_graph.number_of_vertices, 0); thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_), this->dendrogram_->current_level_begin(), diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index a0f7f1c2421..ae72cd8d44b 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -210,7 +210,7 @@ class Louvain { virtual void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(num_vertices); + dendrogram_->add_level(num_vertices, 0); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), dendrogram_->current_level_begin(), diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index b0df1f2a757..e3ae267e710 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -26,6 +26,9 @@ #include #include #include +#include + +#include //#define TIMING @@ -57,19 +60,16 @@ class Louvain { handle_(handle), dendrogram_(std::make_unique>()), current_graph_view_(graph_view), - local_num_vertices_(graph_view.get_number_of_local_vertices()), - vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), + cluster_keys_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), cluster_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), + vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), src_vertex_weights_cache_v_(0, handle.get_stream()), - src_cluster_weights_cache_v_(0, handle.get_stream()), - dst_cluster_weights_cache_v_(0, handle.get_stream()), src_cluster_cache_v_(0, handle.get_stream()), dst_cluster_cache_v_(0, handle.get_stream()), stream_(handle.get_stream()) { if (graph_view_t::is_multi_gpu) { rank_ = handle.get_comms().get_rank(); - base_vertex_id_ = graph_view.get_local_vertex_first(); } } @@ -88,7 +88,7 @@ class Louvain { current_graph_view_, thrust::make_constant_iterator(0), thrust::make_constant_iterator(0), - [] __device__(auto, auto, weight_t wt, auto, auto) { return wt; }, + [] __device__(auto src, auto dst, weight_t wt, auto, auto) { return wt; }, weight_t{0}); while (dendrogram_->num_levels() < max_level) { @@ -141,23 +141,28 @@ class Louvain { protected: void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(num_vertices); + dendrogram_->add_level(num_vertices, current_graph_view_.get_local_vertex_first()); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), dendrogram_->current_level_begin(), dendrogram_->current_level_end(), - base_vertex_id_); + current_graph_view_.get_local_vertex_first()); } public: weight_t modularity(weight_t total_edge_weight, 
weight_t resolution) { - weight_t sum_degree_squared = experimental::transform_reduce_v( - handle_, - current_graph_view_, + weight_t sum_degree_squared = thrust::transform_reduce( + rmm::exec_policy(stream_)->on(stream_), cluster_weights_v_.begin(), + cluster_weights_v_.end(), [] __device__(weight_t p) { return p * p; }, - weight_t{0}); + weight_t{0}, + thrust::plus()); + + if (graph_t::is_multi_gpu) { + sum_degree_squared = host_scalar_allreduce(handle_.get_comms(), sum_degree_squared, stream_); + } weight_t sum_internal = experimental::transform_reduce_e( handle_, @@ -185,16 +190,38 @@ class Louvain { vertex_weights_v_ = current_graph_view_.compute_out_weight_sums(handle_); - thrust::copy(rmm::exec_policy(stream_)->on(stream_), - vertex_weights_v_.begin(), - vertex_weights_v_.end(), - cluster_weights_v_.begin()); + thrust::sequence(rmm::exec_policy(stream_)->on(stream_), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + current_graph_view_.get_local_vertex_first()); + + raft::copy( + cluster_weights_v_.begin(), vertex_weights_v_.begin(), vertex_weights_v_.size(), stream_); d_src_vertex_weights_cache_ = cache_src_vertex_properties(vertex_weights_v_, src_vertex_weights_cache_v_); - std::tie(d_src_cluster_weights_cache_, d_dst_cluster_weights_cache_) = cache_vertex_properties( - cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); + if (graph_view_t::is_multi_gpu) { + auto const comm_size = handle_.get_comms().get_size(); + rmm::device_uvector rx_keys_v(0, handle_.get_stream()); + rmm::device_uvector rx_weights_v(0, handle_.get_stream()); + + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(cluster_keys_v_.begin(), cluster_weights_v_.begin())); + + std::forward_as_tuple(std::tie(rx_keys_v, rx_weights_v), std::ignore) = + groupby_gpuid_and_shuffle_values( + handle_.get_comms(), + pair_first, + pair_first + current_graph_view_.get_number_of_local_vertices(), + [key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{ + comm_size}] __device__(auto val) { return key_func(thrust::get<0>(val)); }, + handle_.get_stream()); + + cluster_keys_v_ = std::move(rx_keys_v); + cluster_weights_v_ = std::move(rx_weights_v); + } timer_stop(stream_); } @@ -268,10 +295,10 @@ class Louvain { new_Q = modularity(total_edge_weight, resolution); if (new_Q > cur_Q) { - thrust::copy(rmm::exec_policy(stream_)->on(stream_), - next_cluster_v.begin(), - next_cluster_v.end(), - dendrogram_->current_level_begin()); + raft::copy(dendrogram_->current_level_begin(), + next_cluster_v.begin(), + next_cluster_v.size(), + stream_); } } @@ -279,46 +306,118 @@ class Louvain { return cur_Q; } - void update_by_delta_modularity(weight_t total_edge_weight, - weight_t resolution, - rmm::device_uvector &next_cluster_v, - bool up_down) + void compute_cluster_sum_and_subtract(rmm::device_uvector &old_cluster_sum_v, + rmm::device_uvector &cluster_subtract_v) { - rmm::device_uvector old_cluster_sum_v(local_num_vertices_, stream_); - rmm::device_uvector cluster_subtract_v(local_num_vertices_, stream_); - - rmm::device_uvector tmp_cluster_keys_v(0, stream_); - rmm::device_uvector tmp_cluster_weights_v(0, stream_); + auto output_buffer = + cugraph::experimental::allocate_dataframe_buffer>( + current_graph_view_.get_number_of_local_vertices(), stream_); - // TODO: These two calls could be combined and return a pair, I would think that would be faster experimental::copy_v_transform_reduce_out_nbr( handle_, current_graph_view_, d_src_cluster_cache_, d_dst_cluster_cache_, [] 
__device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { - if ((src != dst) && (src_cluster == nbr_cluster)) { - return wt; - } else - return weight_t{0}; - }, - weight_t{0}, - old_cluster_sum_v.begin()); + weight_t subtract{0}; + weight_t sum{0}; - experimental::copy_v_transform_reduce_out_nbr( - handle_, - current_graph_view_, - d_src_cluster_cache_, - d_dst_cluster_cache_, - [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { - return (src == dst) ? wt : weight_t{0}; + if (src == dst) + subtract = wt; + else if (src_cluster == nbr_cluster) + sum = wt; + + return thrust::make_tuple(subtract, sum); }, - weight_t{0}, - cluster_subtract_v.begin()); + thrust::make_tuple(weight_t{0}, weight_t{0}), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer)); + + thrust::transform( + rmm::exec_policy(stream_)->on(stream_), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer) + + current_graph_view_.get_number_of_local_vertices(), + old_cluster_sum_v.begin(), + [] __device__(auto p) { return thrust::get<1>(p); }); + + thrust::transform( + rmm::exec_policy(stream_)->on(stream_), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer), + cugraph::experimental::get_dataframe_buffer_begin>( + output_buffer) + + current_graph_view_.get_number_of_local_vertices(), + cluster_subtract_v.begin(), + [] __device__(auto p) { return thrust::get<0>(p); }); + } + + void update_by_delta_modularity(weight_t total_edge_weight, + weight_t resolution, + rmm::device_uvector &next_cluster_v, + bool up_down) + { + rmm::device_uvector old_cluster_sum_v(current_graph_view_.get_number_of_local_vertices(), stream_); + rmm::device_uvector cluster_subtract_v(current_graph_view_.get_number_of_local_vertices(), stream_); + rmm::device_uvector src_cluster_weights_v(next_cluster_v.size(), stream_); + rmm::device_uvector dst_cluster_weights_v(next_cluster_v.size(), stream_); + + compute_cluster_sum_and_subtract(old_cluster_sum_v, cluster_subtract_v); auto output_buffer = cugraph::experimental::allocate_dataframe_buffer>( - local_num_vertices_, stream_); + current_graph_view_.get_number_of_local_vertices(), stream_); + + vertex_t *map_key_first; + vertex_t *map_key_last; + weight_t *map_value_first; + + if (graph_t::is_multi_gpu) { + cugraph::experimental::detail::compute_gpu_id_from_vertex_t vertex_to_gpu_id_op{ + handle_.get_comms().get_size()}; + + src_cluster_weights_v = cugraph::experimental::collect_values_for_keys( + handle_.get_comms(), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.data(), + d_src_cluster_cache_, + d_src_cluster_cache_ + src_cluster_cache_v_.size(), + vertex_to_gpu_id_op, + stream_); + + dst_cluster_weights_v = cugraph::experimental::collect_values_for_keys( + handle_.get_comms(), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.data(), + d_dst_cluster_cache_, + d_dst_cluster_cache_ + dst_cluster_cache_v_.size(), + vertex_to_gpu_id_op, + stream_); + + map_key_first = d_dst_cluster_cache_; + map_key_last = d_dst_cluster_cache_ + dst_cluster_cache_v_.size(); + map_value_first = dst_cluster_weights_v.begin(); + } else { + thrust::transform(rmm::exec_policy(stream_)->on(stream_), + next_cluster_v.begin(), + next_cluster_v.end(), + src_cluster_weights_v.begin(), + [d_cluster_weights = cluster_weights_v_.data(), + d_cluster_keys = cluster_keys_v_.data(), + num_clusters = cluster_keys_v_.size()] 
__device__(vertex_t cluster) { + auto pos = thrust::find( + thrust::seq, d_cluster_keys, d_cluster_keys + num_clusters, cluster); + return d_cluster_weights[pos - d_cluster_keys]; + }); + + map_key_first = d_src_cluster_cache_; + map_key_last = d_src_cluster_cache_ + src_cluster_weights_v.size(); + map_value_first = src_cluster_weights_v.begin(); + } copy_v_transform_reduce_key_aggregated_out_nbr( handle_, @@ -326,25 +425,20 @@ class Louvain { thrust::make_zip_iterator(thrust::make_tuple(old_cluster_sum_v.begin(), d_src_vertex_weights_cache_, cluster_subtract_v.begin(), - d_src_cluster_cache_)), + d_src_cluster_cache_, + src_cluster_weights_v.begin())), d_dst_cluster_cache_, - thrust::make_counting_iterator(base_vertex_id_), - thrust::make_counting_iterator(base_vertex_id_ + local_num_vertices_), - d_dst_cluster_weights_cache_, - [base_vertex_id = base_vertex_id_, - d_src_cluster_weights = d_src_cluster_weights_cache_, - total_edge_weight, - resolution] __device__(auto src, - auto neighbor_cluster, - auto new_cluster_sum, - auto src_info, - auto a_new) { + map_key_first, + map_key_last, + map_value_first, + [rank = rank_, total_edge_weight, resolution] __device__( + auto src, auto neighbor_cluster, auto new_cluster_sum, auto src_info, auto a_new) { auto old_cluster_sum = thrust::get<0>(src_info); auto k_k = thrust::get<1>(src_info); auto cluster_subtract = thrust::get<2>(src_info); auto src_cluster = thrust::get<3>(src_info); - auto a_old = d_src_cluster_weights[src_cluster]; + auto a_old = thrust::get<4>(src_info); if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; @@ -352,10 +446,31 @@ class Louvain { resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / (total_edge_weight * total_edge_weight)); +#if 0 + printf( + "rank = %d, src = %d, neighbor_cluster = %d, new_cluster_sum = %g, old_cluster_sum = " + "%g, " + "a_new = %g, a_old = %g, k_k = %g, delta_modularity = %g\n", + (int)rank, + (int)src, + (int)neighbor_cluster, + (float)new_cluster_sum, + (float)old_cluster_sum, + (float)a_new, + (float)a_old, + (float)k_k, + (float)delta_modularity); +#endif + return thrust::make_tuple(neighbor_cluster, delta_modularity); }, [] __device__(auto p1, auto p2) { - return (thrust::get<1>(p1) < thrust::get<1>(p2)) ? p2 : p1; + auto id1 = thrust::get<0>(p1); + auto id2 = thrust::get<0>(p2); + auto wt1 = thrust::get<1>(p1); + auto wt2 = thrust::get<1>(p2); + + return (wt1 < wt2) ? p2 : ((wt1 > wt2) ? p1 : ((id1 < id2) ? p1 : p2)); }, thrust::make_tuple(vertex_t{-1}, weight_t{0}), cugraph::experimental::get_dataframe_buffer_begin>( @@ -368,10 +483,20 @@ class Louvain { cugraph::experimental::get_dataframe_buffer_begin>( output_buffer), next_cluster_v.begin(), - [up_down] __device__(vertex_t old_cluster, auto p) { + [rank = rank_, up_down] __device__(vertex_t old_cluster, auto p) { vertex_t new_cluster = thrust::get<0>(p); weight_t delta_modularity = thrust::get<1>(p); +#if 0 + printf( + "rank = %d, old_cluster = %d, new_cluster = %d, delta_modularity = %g, up_down = %s\n", + (int)rank, + (int)old_cluster, + (int)new_cluster, + (float)delta_modularity, + up_down ? "true" : false); +#endif + return (delta_modularity > weight_t{0}) ? (((new_cluster > old_cluster) != up_down) ? 
old_cluster : new_cluster) : old_cluster; @@ -380,31 +505,15 @@ class Louvain { std::tie(d_src_cluster_cache_, d_dst_cluster_cache_) = cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); - std::tie(tmp_cluster_keys_v, tmp_cluster_weights_v) = + std::tie(cluster_keys_v_, cluster_weights_v_) = cugraph::experimental::transform_reduce_by_adj_matrix_row_key_e( handle_, current_graph_view_, thrust::make_constant_iterator(0), thrust::make_constant_iterator(0), d_src_cluster_cache_, - [] __device__(auto src, auto, auto wt, auto, auto) { return wt; }, + [] __device__(auto src, auto dst, auto wt, auto x, auto y) { return wt; }, weight_t{0}); - - thrust::fill(rmm::exec_policy(stream_)->on(stream_), - cluster_weights_v_.begin(), - cluster_weights_v_.end(), - weight_t{0}); - - thrust::scatter(rmm::exec_policy(stream_)->on(stream_), - tmp_cluster_weights_v.begin(), - tmp_cluster_weights_v.end(), - thrust::make_transform_iterator(tmp_cluster_keys_v.begin(), - [base_vertex_id = base_vertex_id_] __device__( - auto key) { return key - base_vertex_id; }), - cluster_weights_v_.begin()); - - std::tie(d_src_cluster_weights_cache_, d_dst_cluster_weights_cache_) = cache_vertex_properties( - cluster_weights_v_, src_cluster_weights_cache_v_, dst_cluster_weights_cache_v_); } void shrink_graph() @@ -418,20 +527,17 @@ class Louvain { current_graph_view_ = current_graph_->view(); - local_num_vertices_ = current_graph_view_.get_number_of_local_vertices(); - base_vertex_id_ = current_graph_view_.get_local_vertex_first(); - rmm::device_uvector numbering_indices(numbering_map.size(), stream_); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), numbering_indices.begin(), numbering_indices.end(), - base_vertex_id_); + current_graph_view_.get_local_vertex_first()); relabel( handle_, std::make_tuple(static_cast(numbering_map.begin()), static_cast(numbering_indices.begin())), - local_num_vertices_, + current_graph_view_.get_number_of_local_vertices(), dendrogram_->current_level_begin(), dendrogram_->current_level_size()); @@ -444,8 +550,6 @@ class Louvain { std::unique_ptr> dendrogram_; - vertex_t local_num_vertices_; - vertex_t base_vertex_id_{0}; int rank_{0}; // @@ -457,16 +561,13 @@ class Louvain { graph_view_t current_graph_view_; rmm::device_uvector vertex_weights_v_; - rmm::device_uvector cluster_weights_v_; rmm::device_uvector src_vertex_weights_cache_v_; - rmm::device_uvector src_cluster_weights_cache_v_; - rmm::device_uvector dst_cluster_weights_cache_v_; rmm::device_uvector src_cluster_cache_v_; rmm::device_uvector dst_cluster_cache_v_; + rmm::device_uvector cluster_keys_v_; + rmm::device_uvector cluster_weights_v_; weight_t *d_src_vertex_weights_cache_; - weight_t *d_src_cluster_weights_cache_; - weight_t *d_dst_cluster_weights_cache_; vertex_t *d_src_cluster_cache_; vertex_t *d_dst_cluster_cache_; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 813008a6fdf..d686125db29 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -427,9 +427,6 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG LOUVAIN tests --------------------------------------------------------------------- set(MG_LOUVAIN_TEST_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/../../thirdparty/mmio/mmio.c" - "${CMAKE_CURRENT_SOURCE_DIR}/utilities/test_utilities.cpp" - "${CMAKE_CURRENT_SOURCE_DIR}/utilities/mg_test_utilities.cu" "${CMAKE_CURRENT_SOURCE_DIR}/community/louvain_mg_test.cpp") ConfigureTest(MG_LOUVAIN_TEST "${MG_LOUVAIN_TEST_SRCS}") diff --git a/cpp/tests/community/louvain_mg_test.cpp 
b/cpp/tests/community/louvain_mg_test.cpp index 06715d52bc0..3c14f56d3c8 100644 --- a/cpp/tests/community/louvain_mg_test.cpp +++ b/cpp/tests/community/louvain_mg_test.cpp @@ -15,14 +15,23 @@ */ #include -#include #include #include #include +#include +#include +#include + #include +void compare(float modularity, float sg_modularity) { ASSERT_FLOAT_EQ(modularity, sg_modularity); } +void compare(double modularity, double sg_modularity) +{ + ASSERT_DOUBLE_EQ(modularity, sg_modularity); +} + //////////////////////////////////////////////////////////////////////////////// // Test param object. This defines the input and expected output for a test, and // will be instantiated as the parameter to the tests defined below using @@ -34,8 +43,8 @@ struct Louvain_Testparams { size_t max_level; double resolution; - // TODO: We really should have a Graph_Testparms_Base class or something - // like that which can handle this graph_full_path thing. + // FIXME: We really should have a Graph_Testparms_Base class or something + // like that which can handle this graph_full_path thing. // Louvain_Testparams(std::string const& graph_file_path, bool weighted, @@ -57,9 +66,11 @@ struct Louvain_Testparams { // test. In this case, each test is identical except for the inputs and // expected outputs, so the entire test is defined in the run_test() method. // -class Louvain_MG_Testfixture : public cugraph::test::MG_TestFixture_t, - public ::testing::WithParamInterface { +class Louvain_MG_Testfixture : public ::testing::TestWithParam { public: + static void SetUpTestCase() {} + static void TearDownTestCase() {} + // Run once for each test instance virtual void SetUp() {} virtual void TearDown() {} @@ -73,13 +84,16 @@ class Louvain_MG_Testfixture : public cugraph::test::MG_TestFixture_t, size_t max_level, weight_t resolution) { - // TODO: Put this in the Graph test base class - // (make the call simpler here) - auto graph = - cugraph::test::read_graph_from_matrix_market_file( - handle, graph_file_path, true); // FIXME: should use param.test_weighted instead of true - - auto graph_view = graph.view(); + // FIXME: Put this in the Graph test base class + // (make the call simpler here) + auto graph_tuple = + cugraph::test::read_graph_from_matrix_market_file( + handle, + graph_file_path, + true, + false); // FIXME: should use param.test_weighted instead of true + + auto graph_view = std::get<0>(graph_tuple).view(); cudaStream_t stream = handle.get_stream(); rmm::device_uvector clustering_v(graph_view.get_number_of_local_vertices(), stream); @@ -97,36 +111,46 @@ class Louvain_MG_Testfixture : public cugraph::test::MG_TestFixture_t, } // Compare the results of running louvain on multiple GPUs to that of a - // single-GPU run for the configuration in param. + // single-GPU run for the configuration in param. Note that MNMG Louvain + // and single GPU Louvain are ONLY deterministic through a single + // iteration of the outer loop. Renumbering of the partitions when coarsening + // the graph is a function of the number of GPUs in the GPU cluster. template void run_test(const Louvain_Testparams& param) { raft::handle_t handle; + raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD); const auto& comm = handle.get_comms(); - cudaStream_t stream = handle.get_stream(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); - // Assuming 2 GPUs which means 1 row, 2 cols. 2 cols = row_comm_size of 2. 
- // FIXME: DO NOT ASSUME 2 GPUs, add code to compute prows, pcols - size_t row_comm_size{2}; + auto row_comm_size = static_cast(sqrt(static_cast(comm_size))); + while (comm_size % row_comm_size != 0) { --row_comm_size; } cugraph::partition_2d::subcomm_factory_t subcomm_factory(handle, row_comm_size); - int my_rank = comm.get_rank(); + cudaStream_t stream = handle.get_stream(); - // FIXME: graph must be weighted! - std::unique_ptr> // store_transposed=false, - // multi_gpu=true - mg_graph_ptr{}; + cugraph::experimental::graph_t mg_graph(handle); rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); - std::tie(mg_graph_ptr, d_renumber_map_labels) = cugraph::test:: - create_graph_for_gpu // store_transposed=true - (handle, param.graph_file_full_path); + std::tie(mg_graph, d_renumber_map_labels) = + cugraph::test::read_graph_from_matrix_market_file( + handle, param.graph_file_full_path, true, false); + + // Each GPU will have a subset of the clustering + int sg_level; + weight_t sg_modularity; + std::vector sg_clustering; + + // FIXME: Consider how to test for max_level > 1 + // perhaps some sort of approximation + // size_t local_max_level{param.max_level}; + size_t local_max_level{1}; - auto mg_graph_view = mg_graph_ptr->view(); + auto mg_graph_view = mg_graph.view(); rmm::device_uvector clustering_v(mg_graph_view.get_number_of_local_vertices(), stream); @@ -136,78 +160,30 @@ class Louvain_MG_Testfixture : public cugraph::test::MG_TestFixture_t, int level; weight_t modularity; - std::cout << "calling MG louvain" << std::endl; - std::tie(level, modularity) = cugraph::louvain( - handle, mg_graph_view, clustering_v.data(), param.max_level, param.resolution); - - std::vector clustering(mg_graph_view.get_number_of_local_vertices()); - - raft::update_host(clustering.data(), clustering_v.data(), clustering_v.size(), stream); - - std::vector h_renumber_map_labels(mg_graph_view.get_number_of_vertices()); - raft::update_host(h_renumber_map_labels.data(), - d_renumber_map_labels.data(), - d_renumber_map_labels.size(), - stream); + handle, mg_graph_view, clustering_v.data(), local_max_level, param.resolution); - // Compare MG to SG + if (comm_rank == 0) { + SCOPED_TRACE("compare modularity input: " + param.graph_file_full_path); - // Each GPU will have a subset of the clustering - int sg_level; - weight_t sg_modularity; - std::vector sg_clustering; + std::tie(sg_level, sg_modularity, sg_clustering) = get_sg_results( + handle, param.graph_file_full_path, local_max_level, param.resolution); - std::tie(sg_level, sg_modularity, sg_clustering) = get_sg_results( - handle, param.graph_file_full_path, param.max_level, param.resolution); - - std::cout << "MG: level = " << level << ", modularity = " << modularity << std::endl; - raft::print_host_vector("clustering", clustering.data(), clustering.size(), std::cout); - - std::cout << "SG: level = " << sg_level << ", modularity = " << sg_modularity << std::endl; - raft::print_host_vector("clustering", sg_clustering.data(), sg_clustering.size(), std::cout); - -#if 0 - // For this test, each GPU will have the full set of vertices and - // therefore the pageranks vectors should be equal in size. 
- ASSERT_EQ(h_sg_pageranks.size(), h_mg_pageranks.size()); - - auto threshold_ratio = 1e-3; - auto threshold_magnitude = - (1.0 / static_cast(mg_graph_view.get_number_of_vertices())) * - threshold_ratio; // skip comparison for low PageRank verties (lowly ranked vertices) - auto nearly_equal = [threshold_ratio, threshold_magnitude](auto lhs, auto rhs) { - return std::abs(lhs - rhs) < - std::max(std::max(lhs, rhs) * threshold_ratio, threshold_magnitude); - }; - - vertex_t mapped_vertex{0}; - for (vertex_t i = 0; - i + mg_graph_view.get_local_vertex_first() < mg_graph_view.get_local_vertex_last(); - ++i) { - mapped_vertex = h_renumber_map_labels[i]; - ASSERT_TRUE(nearly_equal(h_mg_pageranks[i], h_sg_pageranks[mapped_vertex])) - << "MG PageRank value for vertex: " << i << " in rank: " << my_rank - << " has value: " << h_mg_pageranks[i] - << " which exceeds the error margin for comparing to SG value: " << h_sg_pageranks[i]; + compare(modularity, sg_modularity); } -#endif } }; //////////////////////////////////////////////////////////////////////////////// -TEST_P(Louvain_MG_Testfixture, CheckInt32Int32FloatFloat) +TEST_P(Louvain_MG_Testfixture, CheckInt32Int32Float) { run_test(GetParam()); } INSTANTIATE_TEST_CASE_P( - e2e, + simple_test, Louvain_MG_Testfixture, - ::testing::Values(Louvain_Testparams("test/datasets/karate.mtx", true, 100, 1) - // Louvain_Testparams("test/datasets/webbase-1M.mtx", true, 100, 1), - )); + ::testing::Values(Louvain_Testparams("test/datasets/karate.mtx", true, 100, 1), + Louvain_Testparams("test/datasets/smallworld.mtx", true, 100, 1))); -// FIXME: Enable proper RMM configuration by using CUGRAPH_TEST_PROGRAM_MAIN(). -// Currently seeing a RMM failure during init, need to investigate. -// CUGRAPH_TEST_PROGRAM_MAIN() +CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/generate_graph_from_edgelist.cu b/cpp/tests/utilities/generate_graph_from_edgelist.cu index 1b9fe6051f7..fa10c31a5a4 100644 --- a/cpp/tests/utilities/generate_graph_from_edgelist.cu +++ b/cpp/tests/utilities/generate_graph_from_edgelist.cu @@ -30,39 +30,47 @@ namespace test { namespace detail { -template -std::enable_if_t< - multi_gpu, - std::tuple< - cugraph::experimental::graph_t, - rmm::device_uvector>> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber) -{ - CUGRAPH_EXPECTS(renumber, "renumber should be true if multi_gpu is true."); - - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_size = col_comm.get_size(); - - vertex_t number_of_vertices = static_cast(vertices.size()); +template +struct compute_gpu_id_from_vertex_no_renumbering { + int comm_size{0}; + vertex_t num_vertices{1}; + + __device__ int operator()(vertex_t v) const + { + vertex_t vertices_per_gpu = static_cast(num_vertices + comm_size - 1) / comm_size; + return static_cast(v / vertices_per_gpu); + } +}; + +template +struct compute_gpu_id_from_edge_no_renumbering { + bool hypergraph_partitioned{false}; + int comm_size{0}; + int row_comm_size{0}; + int col_comm_size{0}; + vertex_t 
num_vertices{1}; + + __device__ int operator()(vertex_t major, vertex_t minor) const + { + vertex_t vertices_per_gpu = static_cast(num_vertices + comm_size - 1) / comm_size; + auto major_comm_rank = static_cast(major / vertices_per_gpu); + auto minor_comm_rank = static_cast(minor / vertices_per_gpu); + + if (hypergraph_partitioned) { + return (minor_comm_rank / col_comm_size) * row_comm_size + (major_comm_rank % row_comm_size); + } else { + return (major_comm_rank - (major_comm_rank % row_comm_size)) + + (minor_comm_rank / col_comm_size); + } + } +}; - auto vertex_key_func = - cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}; +template +void filter_vertices(raft::handle_t const& handle, + rmm::device_uvector& vertices, + key_function_t vertex_key_func, + int comm_rank) +{ vertices.resize(thrust::distance(vertices.begin(), thrust::remove_if( rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), @@ -72,10 +80,17 @@ generate_graph_from_edgelist(raft::handle_t const& handle, return key_func(val) != comm_rank; })), handle.get_stream()); - vertices.shrink_to_fit(handle.get_stream()); +} - auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - false, comm_size, row_comm_size, col_comm_size}; +template +void filter_edges(raft::handle_t const& handle, + rmm::device_uvector& edgelist_rows, + rmm::device_uvector& edgelist_cols, + rmm::device_uvector& edgelist_weights, + key_function_t edge_key_func, + int comm_rank, + bool test_weighted) +{ size_t number_of_local_edges{}; if (test_weighted) { auto edge_first = thrust::make_zip_iterator( @@ -106,35 +121,120 @@ generate_graph_from_edgelist(raft::handle_t const& handle, } edgelist_rows.resize(number_of_local_edges, handle.get_stream()); - edgelist_rows.shrink_to_fit(handle.get_stream()); edgelist_cols.resize(number_of_local_edges, handle.get_stream()); - edgelist_cols.shrink_to_fit(handle.get_stream()); - if (test_weighted) { - edgelist_weights.resize(number_of_local_edges, handle.get_stream()); - edgelist_weights.shrink_to_fit(handle.get_stream()); + if (test_weighted) edgelist_weights.resize(number_of_local_edges, handle.get_stream()); +} + +template +std::enable_if_t< + multi_gpu, + std::tuple< + cugraph::experimental::graph_t, + rmm::device_uvector>> +generate_graph_from_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber) +{ + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto const row_comm_rank = row_comm.get_rank(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + auto const col_comm_rank = col_comm.get_rank(); + + vertex_t number_of_vertices = static_cast(vertices.size()); + edge_t number_of_edges = static_cast(edgelist_rows.size()); + + if (renumber) { + filter_vertices( + handle, + vertices, + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}, + comm_rank); + filter_edges( + handle, + edgelist_rows, + edgelist_cols, + edgelist_weights, + cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + false, comm_size, row_comm_size, col_comm_size}, + comm_rank, + 
test_weighted); + } else { + filter_vertices( + handle, + vertices, + compute_gpu_id_from_vertex_no_renumbering{comm_size, number_of_vertices}, + comm_rank); + filter_edges(handle, + edgelist_rows, + edgelist_cols, + edgelist_weights, + compute_gpu_id_from_edge_no_renumbering{ + false, comm_size, row_comm_size, col_comm_size, number_of_vertices}, + comm_rank, + test_weighted); } + vertices.shrink_to_fit(handle.get_stream()); + edgelist_rows.shrink_to_fit(handle.get_stream()); + edgelist_cols.shrink_to_fit(handle.get_stream()); + if (test_weighted) edgelist_weights.shrink_to_fit(handle.get_stream()); + // 3. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); cugraph::experimental::partition_t partition{}; vertex_t aggregate_number_of_vertices{}; - edge_t number_of_edges{}; - // FIXME: set do_expensive_check to false once validated - std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = - cugraph::experimental::renumber_edgelist( - handle, - vertices.data(), - static_cast(vertices.size()), - store_transposed ? edgelist_cols.data() : edgelist_rows.data(), - store_transposed ? edgelist_rows.data() : edgelist_cols.data(), - edgelist_rows.size(), - false, - true); - assert(aggregate_number_of_vertices == number_of_vertices); - // 4. create a graph + if (renumber) { + // FIXME: set do_expensive_check to false once validated + std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = + cugraph::experimental::renumber_edgelist( + handle, + vertices.data(), + static_cast(vertices.size()), + store_transposed ? edgelist_cols.data() : edgelist_rows.data(), + store_transposed ? edgelist_rows.data() : edgelist_cols.data(), + edgelist_rows.size(), + false, + true); + assert(aggregate_number_of_vertices == number_of_vertices); + } else { + std::vector vertex_partition_offsets(comm_size + 1, 0); + vertex_t vertices_per_gpu = + static_cast(number_of_vertices + comm_size - 1) / comm_size; + + std::for_each(thrust::make_counting_iterator(0), + thrust::make_counting_iterator(comm_size), + [h_vertex_partition_offsets = vertex_partition_offsets.data(), vertices_per_gpu]( + auto idx) { h_vertex_partition_offsets[idx] = idx * vertices_per_gpu; }); + + vertex_partition_offsets[comm_size] = number_of_vertices; + partition = cugraph::experimental::partition_t(vertex_partition_offsets, + false, // is_hypergraph_partitioned, + row_comm_size, + col_comm_size, + row_comm_rank, + col_comm_rank); + + //number_of_edges = edgelist_rows.size(); + } + + // 4. 
create a graph return std::make_tuple( cugraph::experimental::graph_t( handle, @@ -148,7 +248,7 @@ generate_graph_from_edgelist(raft::handle_t const& handle, number_of_vertices, number_of_edges, cugraph::experimental::graph_properties_t{is_symmetric, false}, - true, + false, true), std::move(renumber_map_labels)); } From 7cb4c135b364bd97ef6e15a768bf4dbc093f30e5 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Tue, 16 Mar 2021 15:03:40 -0400 Subject: [PATCH 06/18] respond to PR comments --- cpp/src/community/dendrogram.cuh | 4 +- cpp/src/community/ecg.cu | 2 +- cpp/src/community/leiden.cuh | 2 +- cpp/src/community/louvain.cuh | 2 +- cpp/src/experimental/louvain.cuh | 145 +++++++++++++------------------ 5 files changed, 66 insertions(+), 89 deletions(-) diff --git a/cpp/src/community/dendrogram.cuh b/cpp/src/community/dendrogram.cuh index eed34064d9a..9389de3292d 100644 --- a/cpp/src/community/dendrogram.cuh +++ b/cpp/src/community/dendrogram.cuh @@ -25,8 +25,8 @@ namespace cugraph { template class Dendrogram { public: - void add_level(vertex_t num_verts, - vertex_t first_index, + void add_level(vertex_t first_index, + vertex_t num_verts, cudaStream_t stream = 0, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { diff --git a/cpp/src/community/ecg.cu b/cpp/src/community/ecg.cu index 1563d9393d8..45f7d723191 100644 --- a/cpp/src/community/ecg.cu +++ b/cpp/src/community/ecg.cu @@ -117,7 +117,7 @@ class EcgLouvain : public cugraph::Louvain { void initialize_dendrogram_level(vertex_t num_vertices) override { - this->dendrogram_->add_level(num_vertices, 0); + this->dendrogram_->add_level(0, num_vertices); get_permutation_vector( num_vertices, seed_, this->dendrogram_->current_level_begin(), this->stream_); diff --git a/cpp/src/community/leiden.cuh b/cpp/src/community/leiden.cuh index 3c3c7815aa8..aae2d3712b5 100644 --- a/cpp/src/community/leiden.cuh +++ b/cpp/src/community/leiden.cuh @@ -132,7 +132,7 @@ class Leiden : public Louvain { // // Initialize every cluster to reference each vertex to itself // - this->dendrogram_->add_level(current_graph.number_of_vertices, 0); + this->dendrogram_->add_level(0, current_graph.number_of_vertices); thrust::sequence(rmm::exec_policy(this->stream_)->on(this->stream_), this->dendrogram_->current_level_begin(), diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index ae72cd8d44b..b81e619e747 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -210,7 +210,7 @@ class Louvain { virtual void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(num_vertices, 0); + dendrogram_->add_level(0, num_vertices); thrust::sequence(rmm::exec_policy(stream_)->on(stream_), dendrogram_->current_level_begin(), diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index e3ae267e710..60086341e4b 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -65,12 +65,8 @@ class Louvain { vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream()), src_vertex_weights_cache_v_(0, handle.get_stream()), src_cluster_cache_v_(0, handle.get_stream()), - dst_cluster_cache_v_(0, handle.get_stream()), - stream_(handle.get_stream()) + dst_cluster_cache_v_(0, handle.get_stream()) { - if (graph_view_t::is_multi_gpu) { - rank_ = handle.get_comms().get_rank(); - } } Dendrogram const &get_dendrogram() const { return *dendrogram_; } @@ -117,14 +113,23 @@ class Louvain { void timer_start(std::string const ®ion) { 
#ifdef TIMING - if (rank_ == 0) hr_timer_.start(region); + if (graph_view_t::is_multi_gpu) { + if (handle.get_comms().get_rank() == 0) hr_timer_.start(region); + } else { + hr_timer_.start(region); + } #endif } void timer_stop(cudaStream_t stream) { #ifdef TIMING - if (rank_ == 0) { + if (graph_view_t::is_multi_gpu) { + if (handle.get_comms().get_rank() == 0) { + CUDA_TRY(cudaStreamSynchronize(stream)); + hr_timer_.stop(); + } + } else { CUDA_TRY(cudaStreamSynchronize(stream)); hr_timer_.stop(); } @@ -134,16 +139,20 @@ class Louvain { void timer_display(std::ostream &os) { #ifdef TIMING - if (rank_ == 0) hr_timer_.display(os); + if (graph_view_t::is_multi_gpu) { + if (handle.get_comms().get_rank() == 0) hr_timer_.display(os); + } else { + hr_timer_.display(os); + } #endif } protected: void initialize_dendrogram_level(vertex_t num_vertices) { - dendrogram_->add_level(num_vertices, current_graph_view_.get_local_vertex_first()); + dendrogram_->add_level(current_graph_view_.get_local_vertex_first(), num_vertices); - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), + thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), dendrogram_->current_level_begin(), dendrogram_->current_level_end(), current_graph_view_.get_local_vertex_first()); @@ -153,7 +162,7 @@ class Louvain { weight_t modularity(weight_t total_edge_weight, weight_t resolution) { weight_t sum_degree_squared = thrust::transform_reduce( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), cluster_weights_v_.begin(), cluster_weights_v_.end(), [] __device__(weight_t p) { return p * p; }, @@ -161,7 +170,8 @@ class Louvain { thrust::plus()); if (graph_t::is_multi_gpu) { - sum_degree_squared = host_scalar_allreduce(handle_.get_comms(), sum_degree_squared, stream_); + sum_degree_squared = + host_scalar_allreduce(handle_.get_comms(), sum_degree_squared, handle_.get_stream()); } weight_t sum_internal = experimental::transform_reduce_e( @@ -190,13 +200,15 @@ class Louvain { vertex_weights_v_ = current_graph_view_.compute_out_weight_sums(handle_); - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), + thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), cluster_keys_v_.begin(), cluster_keys_v_.end(), current_graph_view_.get_local_vertex_first()); - raft::copy( - cluster_weights_v_.begin(), vertex_weights_v_.begin(), vertex_weights_v_.size(), stream_); + raft::copy(cluster_weights_v_.begin(), + vertex_weights_v_.begin(), + vertex_weights_v_.size(), + handle_.get_stream()); d_src_vertex_weights_cache_ = cache_src_vertex_properties(vertex_weights_v_, src_vertex_weights_cache_v_); @@ -223,7 +235,7 @@ class Louvain { cluster_weights_v_ = std::move(rx_weights_v); } - timer_stop(stream_); + timer_stop(handle_.get_stream()); } template @@ -231,7 +243,7 @@ class Louvain { { if (graph_view_t::is_multi_gpu) { src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), - stream_); + handle_.get_stream()); copy_to_adj_matrix_row(handle_, current_graph_view_, input.begin(), src_cache_v.begin()); return src_cache_v.begin(); } else { @@ -244,7 +256,7 @@ class Louvain { { if (graph_view_t::is_multi_gpu) { dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols(), - stream_); + handle_.get_stream()); copy_to_adj_matrix_col(handle_, current_graph_view_, input.begin(), dst_cache_v.begin()); return dst_cache_v.begin(); } else { @@ -252,30 +264,20 @@ class Louvain { } } - template - 
std::tuple cache_vertex_properties(rmm::device_uvector &input, - rmm::device_uvector &src_cache_v, - rmm::device_uvector &dst_cache_v) - { - auto src = cache_src_vertex_properties(input, src_cache_v); - auto dst = cache_dst_vertex_properties(input, dst_cache_v); - - return std::make_tuple(src, dst); - } - virtual weight_t update_clustering(weight_t total_edge_weight, weight_t resolution) { timer_start("update_clustering"); - rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), stream_); + rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), + handle_.get_stream()); raft::copy(next_cluster_v.begin(), dendrogram_->current_level_begin(), dendrogram_->current_level_size(), - stream_); + handle_.get_stream()); - std::tie(d_src_cluster_cache_, d_dst_cluster_cache_) = - cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); + d_src_cluster_cache_ = cache_src_vertex_properties(next_cluster_v, src_cluster_cache_v_); + d_dst_cluster_cache_ = cache_dst_vertex_properties(next_cluster_v, dst_cluster_cache_v_); weight_t new_Q = modularity(total_edge_weight, resolution); weight_t cur_Q = new_Q - 1; @@ -298,11 +300,11 @@ class Louvain { raft::copy(dendrogram_->current_level_begin(), next_cluster_v.begin(), next_cluster_v.size(), - stream_); + handle_.get_stream()); } } - timer_stop(stream_); + timer_stop(handle_.get_stream()); return cur_Q; } @@ -311,7 +313,7 @@ class Louvain { { auto output_buffer = cugraph::experimental::allocate_dataframe_buffer>( - current_graph_view_.get_number_of_local_vertices(), stream_); + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); experimental::copy_v_transform_reduce_out_nbr( handle_, @@ -334,7 +336,7 @@ class Louvain { output_buffer)); thrust::transform( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), cugraph::experimental::get_dataframe_buffer_begin>( output_buffer), cugraph::experimental::get_dataframe_buffer_begin>( @@ -344,7 +346,7 @@ class Louvain { [] __device__(auto p) { return thrust::get<1>(p); }); thrust::transform( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), cugraph::experimental::get_dataframe_buffer_begin>( output_buffer), cugraph::experimental::get_dataframe_buffer_begin>( @@ -359,16 +361,20 @@ class Louvain { rmm::device_uvector &next_cluster_v, bool up_down) { - rmm::device_uvector old_cluster_sum_v(current_graph_view_.get_number_of_local_vertices(), stream_); - rmm::device_uvector cluster_subtract_v(current_graph_view_.get_number_of_local_vertices(), stream_); - rmm::device_uvector src_cluster_weights_v(next_cluster_v.size(), stream_); - rmm::device_uvector dst_cluster_weights_v(next_cluster_v.size(), stream_); + rmm::device_uvector old_cluster_sum_v( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + rmm::device_uvector cluster_subtract_v( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + rmm::device_uvector src_cluster_weights_v(next_cluster_v.size(), + handle_.get_stream()); + rmm::device_uvector dst_cluster_weights_v(next_cluster_v.size(), + handle_.get_stream()); compute_cluster_sum_and_subtract(old_cluster_sum_v, cluster_subtract_v); auto output_buffer = cugraph::experimental::allocate_dataframe_buffer>( - current_graph_view_.get_number_of_local_vertices(), stream_); + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); vertex_t *map_key_first; vertex_t 
*map_key_last; @@ -386,7 +392,7 @@ class Louvain { d_src_cluster_cache_, d_src_cluster_cache_ + src_cluster_cache_v_.size(), vertex_to_gpu_id_op, - stream_); + handle_.get_stream()); dst_cluster_weights_v = cugraph::experimental::collect_values_for_keys( handle_.get_comms(), @@ -396,13 +402,13 @@ class Louvain { d_dst_cluster_cache_, d_dst_cluster_cache_ + dst_cluster_cache_v_.size(), vertex_to_gpu_id_op, - stream_); + handle_.get_stream()); map_key_first = d_dst_cluster_cache_; map_key_last = d_dst_cluster_cache_ + dst_cluster_cache_v_.size(); map_value_first = dst_cluster_weights_v.begin(); } else { - thrust::transform(rmm::exec_policy(stream_)->on(stream_), + thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), next_cluster_v.begin(), next_cluster_v.end(), src_cluster_weights_v.begin(), @@ -432,7 +438,7 @@ class Louvain { map_key_first, map_key_last, map_value_first, - [rank = rank_, total_edge_weight, resolution] __device__( + [total_edge_weight, resolution] __device__( auto src, auto neighbor_cluster, auto new_cluster_sum, auto src_info, auto a_new) { auto old_cluster_sum = thrust::get<0>(src_info); auto k_k = thrust::get<1>(src_info); @@ -446,22 +452,6 @@ class Louvain { resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / (total_edge_weight * total_edge_weight)); -#if 0 - printf( - "rank = %d, src = %d, neighbor_cluster = %d, new_cluster_sum = %g, old_cluster_sum = " - "%g, " - "a_new = %g, a_old = %g, k_k = %g, delta_modularity = %g\n", - (int)rank, - (int)src, - (int)neighbor_cluster, - (float)new_cluster_sum, - (float)old_cluster_sum, - (float)a_new, - (float)a_old, - (float)k_k, - (float)delta_modularity); -#endif - return thrust::make_tuple(neighbor_cluster, delta_modularity); }, [] __device__(auto p1, auto p2) { @@ -477,33 +467,23 @@ class Louvain { output_buffer)); thrust::transform( - rmm::exec_policy(stream_)->on(stream_), + rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), next_cluster_v.begin(), next_cluster_v.end(), cugraph::experimental::get_dataframe_buffer_begin>( output_buffer), next_cluster_v.begin(), - [rank = rank_, up_down] __device__(vertex_t old_cluster, auto p) { + [up_down] __device__(vertex_t old_cluster, auto p) { vertex_t new_cluster = thrust::get<0>(p); weight_t delta_modularity = thrust::get<1>(p); -#if 0 - printf( - "rank = %d, old_cluster = %d, new_cluster = %d, delta_modularity = %g, up_down = %s\n", - (int)rank, - (int)old_cluster, - (int)new_cluster, - (float)delta_modularity, - up_down ? "true" : false); -#endif - return (delta_modularity > weight_t{0}) ? (((new_cluster > old_cluster) != up_down) ? 
old_cluster : new_cluster) : old_cluster; }); - std::tie(d_src_cluster_cache_, d_dst_cluster_cache_) = - cache_vertex_properties(next_cluster_v, src_cluster_cache_v_, dst_cluster_cache_v_); + d_src_cluster_cache_ = cache_src_vertex_properties(next_cluster_v, src_cluster_cache_v_); + d_dst_cluster_cache_ = cache_dst_vertex_properties(next_cluster_v, dst_cluster_cache_v_); std::tie(cluster_keys_v_, cluster_weights_v_) = cugraph::experimental::transform_reduce_by_adj_matrix_row_key_e( @@ -520,15 +500,15 @@ class Louvain { { timer_start("shrinking graph"); - rmm::device_uvector numbering_map(0, stream_); + rmm::device_uvector numbering_map(0, handle_.get_stream()); std::tie(current_graph_, numbering_map) = coarsen_graph(handle_, current_graph_view_, dendrogram_->current_level_begin()); current_graph_view_ = current_graph_->view(); - rmm::device_uvector numbering_indices(numbering_map.size(), stream_); - thrust::sequence(rmm::exec_policy(stream_)->on(stream_), + rmm::device_uvector numbering_indices(numbering_map.size(), handle_.get_stream()); + thrust::sequence(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), numbering_indices.begin(), numbering_indices.end(), current_graph_view_.get_local_vertex_first()); @@ -541,17 +521,14 @@ class Louvain { dendrogram_->current_level_begin(), dendrogram_->current_level_size()); - timer_stop(stream_); + timer_stop(handle_.get_stream()); } protected: raft::handle_t const &handle_; - cudaStream_t stream_; std::unique_ptr> dendrogram_; - int rank_{0}; - // // Initially we run on the input graph view, // but as we shrink the graph we'll keep the From d660d4ce5b48045bda785b410d366a44538d3738 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Tue, 16 Mar 2021 15:34:18 -0400 Subject: [PATCH 07/18] rename louvain mg test file --- cpp/tests/CMakeLists.txt | 2 +- .../community/{louvain_mg_test.cpp => mg_louvain_test.cpp} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename cpp/tests/community/{louvain_mg_test.cpp => mg_louvain_test.cpp} (100%) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index d686125db29..80c7b8193fb 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -427,7 +427,7 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG LOUVAIN tests --------------------------------------------------------------------- set(MG_LOUVAIN_TEST_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/community/louvain_mg_test.cpp") + "${CMAKE_CURRENT_SOURCE_DIR}/community/mg_louvain_test.cpp") ConfigureTest(MG_LOUVAIN_TEST "${MG_LOUVAIN_TEST_SRCS}") target_link_libraries(MG_LOUVAIN_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) diff --git a/cpp/tests/community/louvain_mg_test.cpp b/cpp/tests/community/mg_louvain_test.cpp similarity index 100% rename from cpp/tests/community/louvain_mg_test.cpp rename to cpp/tests/community/mg_louvain_test.cpp From c90f608b305e2d14f55a648a22ba96473604fe2a Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Tue, 16 Mar 2021 15:45:32 -0400 Subject: [PATCH 08/18] change Testparams to Usecase --- cpp/tests/community/mg_louvain_test.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index 3c14f56d3c8..e97f93d4424 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -37,7 +37,7 @@ void compare(double modularity, double sg_modularity) // will be instantiated as the parameter to the tests defined below using // INSTANTIATE_TEST_CASE_P() // -struct Louvain_Testparams { 
+struct Louvain_Usecase { std::string graph_file_full_path{}; bool weighted{false}; size_t max_level; @@ -46,10 +46,10 @@ struct Louvain_Testparams { // FIXME: We really should have a Graph_Testparms_Base class or something // like that which can handle this graph_full_path thing. // - Louvain_Testparams(std::string const& graph_file_path, - bool weighted, - size_t max_level, - double resolution) + Louvain_Usecase(std::string const& graph_file_path, + bool weighted, + size_t max_level, + double resolution) : weighted(weighted), max_level(max_level), resolution(resolution) { if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { @@ -66,7 +66,7 @@ struct Louvain_Testparams { // test. In this case, each test is identical except for the inputs and // expected outputs, so the entire test is defined in the run_test() method. // -class Louvain_MG_Testfixture : public ::testing::TestWithParam { +class Louvain_MG_Testfixture : public ::testing::TestWithParam { public: static void SetUpTestCase() {} static void TearDownTestCase() {} @@ -116,7 +116,7 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam - void run_test(const Louvain_Testparams& param) + void run_test(const Louvain_Usecase& param) { raft::handle_t handle; @@ -183,7 +183,7 @@ TEST_P(Louvain_MG_Testfixture, CheckInt32Int32Float) INSTANTIATE_TEST_CASE_P( simple_test, Louvain_MG_Testfixture, - ::testing::Values(Louvain_Testparams("test/datasets/karate.mtx", true, 100, 1), - Louvain_Testparams("test/datasets/smallworld.mtx", true, 100, 1))); + ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true, 100, 1), + Louvain_Usecase("test/datasets/smallworld.mtx", true, 100, 1))); CUGRAPH_MG_TEST_PROGRAM_MAIN() From 0673a527400d6bd1643e15c75d981415e4e4d28d Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Thu, 25 Mar 2021 09:48:33 -0400 Subject: [PATCH 09/18] Address PR review items, improved testing 1. Exposed flatten_dendrogram and a version of louvain that returns dendrogram in algorithms 2. Added methods to the legacy graph implementations to make it easier to template between old and new graph objects 3. Removed some class member data that can be derived 4. Added code to suppress gtest output from rank > 0 --- cpp/include/algorithms.hpp | 64 ++++ .../dendrogram.cuh => include/dendrogram.hpp} | 26 +- cpp/include/graph.hpp | 8 + cpp/src/community/flatten_dendrogram.cuh | 2 +- cpp/src/community/louvain.cu | 122 +++--- cpp/src/community/louvain.cuh | 4 +- cpp/src/experimental/louvain.cuh | 4 +- cpp/tests/CMakeLists.txt | 2 + cpp/tests/community/mg_louvain_helper.cu | 353 ++++++++++++++++++ cpp/tests/community/mg_louvain_helper.hpp | 51 +++ cpp/tests/community/mg_louvain_test.cpp | 150 +++++--- cpp/tests/utilities/base_fixture.hpp | 5 + 12 files changed, 677 insertions(+), 114 deletions(-) rename cpp/{src/community/dendrogram.cuh => include/dendrogram.hpp} (68%) create mode 100644 cpp/tests/community/mg_louvain_helper.cu create mode 100644 cpp/tests/community/mg_louvain_helper.hpp diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp index c3a4f3ec985..1c4a84a27e4 100644 --- a/cpp/include/algorithms.hpp +++ b/cpp/include/algorithms.hpp @@ -14,10 +14,14 @@ * limitations under the License. 
*/ #pragma once + +#include #include #include + #include #include + #include namespace cugraph { @@ -637,6 +641,66 @@ std::pair louvain( size_t max_level = 100, typename graph_t::weight_type resolution = typename graph_t::weight_type{1}); +/** + * @brief Louvain implementation, returning dendrogram + * + * Compute a clustering of the graph by maximizing modularity + * + * Computed using the Louvain method described in: + * + * VD Blondel, J-L Guillaume, R Lambiotte and E Lefebvre: Fast unfolding of + * community hierarchies in large networks, J Stat Mech P10008 (2008), + * http://arxiv.org/abs/0803.0476 + * + * @throws cugraph::logic_error when an error occurs. + * + * @tparam graph_t Type of graph + * + * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, + * @param[in] graph input graph object (CSR) + * @param[in] max_level (optional) maximum number of levels to run (default 100) + * @param[in] resolution (optional) The value of the resolution parameter to use. + * Called gamma in the modularity formula, this changes the size + * of the communities. Higher resolutions lead to more smaller + * communities, lower resolutions lead to fewer larger + * communities. (default 1) + * + * @return a pair containing: + * 1) unique pointer to dendrogram + * 2) modularity of the returned clustering + * + */ +template +std::pair>, typename graph_t::weight_type> +louvain(raft::handle_t const &handle, + graph_t const &graph, + size_t max_level = 100, + typename graph_t::weight_type resolution = typename graph_t::weight_type{1}); + +/** + * @brief Flatten a Dendrogram at a particular level + * + * A Dendrogram represents a hierarchical clustering/partitioning of + * a graph. This function will flatten the hierarchical clustering into + * a label for each vertex representing the final cluster/partition to + * which it is assigned + * + * @throws cugraph::logic_error when an error occurs. + * + * @tparam graph_t Type of graph + * + * @param[in] handle Library handle (RAFT). 
If a communicator is set in the handle, + * @param[in] graph input graph object + * @param[in] dendrogram input dendrogram object + * @param[out] clustering Pointer to device array where the clustering should be stored + * + */ +template +void flatten_dendrogram(raft::handle_t const &handle, + graph_t const &graph_view, + Dendrogram const &dendrogram, + typename graph_t::vertex_type *clustering); + /** * @brief Leiden implementation * diff --git a/cpp/src/community/dendrogram.cuh b/cpp/include/dendrogram.hpp similarity index 68% rename from cpp/src/community/dendrogram.cuh rename to cpp/include/dendrogram.hpp index 9389de3292d..83a35110605 100644 --- a/cpp/src/community/dendrogram.cuh +++ b/cpp/include/dendrogram.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -30,27 +31,26 @@ class Dendrogram { cudaStream_t stream = 0, rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) { - level_ptr_.push_back( - std::make_unique(num_verts * sizeof(vertex_t), stream, mr)); - level_size_.push_back(num_verts); + level_ptr_.push_back(std::make_unique>(num_verts, stream, mr)); level_first_index_.push_back(first_index); } - size_t current_level() const { return level_size_.size() - 1; } + size_t current_level() const { return level_ptr_.size() - 1; } - size_t num_levels() const { return level_size_.size(); } + size_t num_levels() const { return level_ptr_.size(); } vertex_t const *get_level_ptr_nocheck(size_t level) const { - return static_cast(level_ptr_[level]->data()); + //return static_cast(level_ptr_[level]->data()); + return level_ptr_[level]->data(); } vertex_t *get_level_ptr_nocheck(size_t level) { - return static_cast(level_ptr_[level]->data()); + return level_ptr_[level]->data(); } - vertex_t get_level_size_nocheck(size_t level) const { return level_size_[level]; } + size_t get_level_size_nocheck(size_t level) const { return level_ptr_[level]->size(); } vertex_t get_level_first_index_nocheck(size_t level) const { return level_first_index_[level]; } @@ -62,14 +62,16 @@ class Dendrogram { vertex_t *current_level_end() { return current_level_begin() + current_level_size(); } - vertex_t current_level_size() const { return get_level_size_nocheck(current_level()); } + size_t current_level_size() const { return get_level_size_nocheck(current_level()); } - vertex_t current_level_first_index() const { return get_level_first_index_nocheck(current_level()); } + vertex_t current_level_first_index() const + { + return get_level_first_index_nocheck(current_level()); + } private: - std::vector level_size_; std::vector level_first_index_; - std::vector> level_ptr_; + std::vector>> level_ptr_; }; } // namespace cugraph diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp index b30159566b5..c4ca3b40c11 100644 --- a/cpp/include/graph.hpp +++ b/cpp/include/graph.hpp @@ -69,6 +69,14 @@ class GraphViewBase { edge_t *local_edges; vertex_t *local_offsets; + vertex_t get_number_of_vertices() const { + return number_of_vertices; + } + + vertex_t get_local_vertex_first() const { + return vertex_t{0}; + } + /** * @brief Fill the identifiers array with the vertex identifiers. 
* diff --git a/cpp/src/community/flatten_dendrogram.cuh b/cpp/src/community/flatten_dendrogram.cuh index 6961d7c9b87..6d455a68192 100644 --- a/cpp/src/community/flatten_dendrogram.cuh +++ b/cpp/src/community/flatten_dendrogram.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include diff --git a/cpp/src/community/louvain.cu b/cpp/src/community/louvain.cu index a851777ad93..ab27923cdb1 100644 --- a/cpp/src/community/louvain.cu +++ b/cpp/src/community/louvain.cu @@ -26,50 +26,28 @@ namespace cugraph { namespace detail { template -std::pair louvain(raft::handle_t const &handle, - GraphCSRView const &graph_view, - vertex_t *clustering, - size_t max_level, - weight_t resolution) +std::pair>, weight_t> louvain( + raft::handle_t const &handle, + GraphCSRView const &graph_view, + size_t max_level, + weight_t resolution) { CUGRAPH_EXPECTS(graph_view.edge_data != nullptr, "Invalid input argument: louvain expects a weighted graph"); - CUGRAPH_EXPECTS(clustering != nullptr, - "Invalid input argument: clustering is null, should be a device pointer to " - "memory for storing the result"); Louvain> runner(handle, graph_view); weight_t wt = runner(max_level, resolution); - rmm::device_uvector vertex_ids_v(graph_view.number_of_vertices, handle.get_stream()); - - thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_ids_v.begin(), - vertex_ids_v.end(), - vertex_t{0}); - - partition_at_level(handle, - runner.get_dendrogram(), - vertex_ids_v.data(), - clustering, - runner.get_dendrogram().num_levels()); - - // FIXME: Consider returning the Dendrogram at some point - return std::make_pair(runner.get_dendrogram().num_levels(), wt); + return std::make_pair(runner.move_dendrogram(), wt); } template -std::pair louvain( +std::pair>, weight_t> louvain( raft::handle_t const &handle, experimental::graph_view_t const &graph_view, - vertex_t *clustering, size_t max_level, weight_t resolution) { - CUGRAPH_EXPECTS(clustering != nullptr, - "Invalid input argument: clustering is null, should be a device pointer to " - "memory for storing the result"); - // "FIXME": remove this check and the guards below // // Disable louvain(experimental::graph_view_t,...) 
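// A minimal sketch of how the dendrogram-returning louvain() overload and the
// flatten_dendrogram() helper declared in algorithms.hpp above are meant to be
// used together; the clustering-pointer louvain() defined further below wraps
// this same sequence. The function name, the single-GPU
// graph_view_t<int32_t, int32_t, float, false, false> instantiation, and the
// include paths are assumptions for illustration only.
#include <algorithms.hpp>
#include <raft/handle.hpp>
#include <rmm/device_uvector.hpp>

#include <memory>
#include <tuple>

void louvain_flatten_sketch(
  raft::handle_t const &handle,
  cugraph::experimental::graph_view_t<int32_t, int32_t, float, false, false> const &graph_view)
{
  std::unique_ptr<cugraph::Dendrogram<int32_t>> dendrogram{};
  float modularity{};

  // Run Louvain, keeping the full cluster hierarchy instead of a flat labeling.
  std::tie(dendrogram, modularity) =
    cugraph::louvain(handle, graph_view, size_t{100}, float{1});

  // Collapse the hierarchy into one cluster id per vertex.
  rmm::device_uvector<int32_t> clustering_v(graph_view.get_number_of_vertices(),
                                            handle.get_stream());
  cugraph::flatten_dendrogram(handle, graph_view, *dendrogram, clustering_v.data());

  // dendrogram->num_levels() corresponds to the level count returned by the
  // clustering-pointer overload.
}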
@@ -87,27 +65,73 @@ std::pair louvain( weight_t wt = runner(max_level, resolution); - rmm::device_uvector vertex_ids_v(graph_view.get_number_of_vertices(), - handle.get_stream()); + return std::make_pair(runner.move_dendrogram(), wt); + } +} - thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - vertex_ids_v.begin(), - vertex_ids_v.end(), - graph_view.get_local_vertex_first()); +template +void flatten_dendrogram(raft::handle_t const &handle, + GraphCSRView const &graph_view, + Dendrogram const &dendrogram, + vertex_t *clustering) +{ + rmm::device_uvector vertex_ids_v(graph_view.number_of_vertices, handle.get_stream()); - partition_at_level(handle, - runner.get_dendrogram(), - vertex_ids_v.data(), - clustering, - runner.get_dendrogram().num_levels()); + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_ids_v.begin(), + vertex_ids_v.end(), + vertex_t{0}); - // FIXME: Consider returning the Dendrogram at some point - return std::make_pair(runner.get_dendrogram().num_levels(), wt); - } + partition_at_level(handle, + dendrogram, + vertex_ids_v.data(), + clustering, + dendrogram.num_levels()); +} + +template +void flatten_dendrogram( + raft::handle_t const &handle, + experimental::graph_view_t const &graph_view, + Dendrogram const &dendrogram, + vertex_t *clustering) +{ + rmm::device_uvector vertex_ids_v(graph_view.get_number_of_vertices(), + handle.get_stream()); + + thrust::sequence(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + vertex_ids_v.begin(), + vertex_ids_v.end(), + graph_view.get_local_vertex_first()); + + partition_at_level(handle, + dendrogram, + vertex_ids_v.data(), + clustering, + dendrogram.num_levels()); } } // namespace detail +template +std::pair>, typename graph_t::weight_type> +louvain(raft::handle_t const &handle, + graph_t const &graph, + size_t max_level, + typename graph_t::weight_type resolution) +{ + return detail::louvain(handle, graph, max_level, resolution); +} + +template +void flatten_dendrogram(raft::handle_t const &handle, + graph_t const &graph_view, + Dendrogram const &dendrogram, + typename graph_t::vertex_type *clustering) { + + detail::flatten_dendrogram(handle, graph_view, dendrogram, clustering); +} + template std::pair louvain(raft::handle_t const &handle, graph_t const &graph, @@ -115,9 +139,19 @@ std::pair louvain(raft::handle_t const &h size_t max_level, typename graph_t::weight_type resolution) { + using vertex_t = typename graph_t::vertex_type; + using weight_t = typename graph_t::weight_type; + CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); - return detail::louvain(handle, graph, clustering, max_level, resolution); + std::unique_ptr> dendrogram; + weight_t modularity; + + std::tie(dendrogram, modularity) = louvain(handle, graph, max_level, resolution); + + flatten_dendrogram(handle, graph, *dendrogram, clustering); + + return std::make_pair(dendrogram->num_levels(), modularity); } // Explicit template instantations diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index b81e619e747..0862bbc62a9 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -20,7 +20,7 @@ #include #include -#include +#include #include @@ -142,7 +142,7 @@ class Louvain { Dendrogram &get_dendrogram() { return *dendrogram_; } - std::unique_ptr> move_dendrogram() { return dendrogram_; } + std::unique_ptr> move_dendrogram() { return std::move(dendrogram_); } virtual weight_t operator()(size_t 
max_level, weight_t resolution) { diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 60086341e4b..9b2840aab3a 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include @@ -73,7 +73,7 @@ class Louvain { Dendrogram &get_dendrogram() { return *dendrogram_; } - std::unique_ptr> move_dendrogram() { return dendrogram_; } + std::unique_ptr> move_dendrogram() { return std::move(dendrogram_); } virtual weight_t operator()(size_t max_level, weight_t resolution) { diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 80c7b8193fb..b8745c4fb86 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -427,10 +427,12 @@ if(BUILD_CUGRAPH_MG_TESTS) # - MG LOUVAIN tests --------------------------------------------------------------------- set(MG_LOUVAIN_TEST_SRCS + "${CMAKE_CURRENT_SOURCE_DIR}/community/mg_louvain_helper.cu" "${CMAKE_CURRENT_SOURCE_DIR}/community/mg_louvain_test.cpp") ConfigureTest(MG_LOUVAIN_TEST "${MG_LOUVAIN_TEST_SRCS}") target_link_libraries(MG_LOUVAIN_TEST PRIVATE MPI::MPI_C MPI::MPI_CXX) + target_link_libraries(MG_LOUVAIN_TEST PRIVATE cugraph) else(MPI_CXX_FOUND) message(FATAL_ERROR "OpenMPI NOT found, cannot build MG tests.") diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu new file mode 100644 index 00000000000..57e030da214 --- /dev/null +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -0,0 +1,353 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "mg_louvain_helper.hpp" + +#include + +#include +#include +#include + +#include + +#include +#include +#include + +namespace cugraph { +namespace test { + +template +rmm::device_uvector gather_distributed_vector(raft::handle_t const &handle, + T const *d_input, + size_t size) +{ + auto rx_sizes = + cugraph::experimental::host_scalar_gather(handle.get_comms(), size, 0, handle.get_stream()); + std::vector rx_displs(static_cast(handle.get_comms().get_rank()) == 0 + ? 
handle.get_comms().get_size() + : int{0}, + size_t{0}); + if (static_cast(handle.get_comms().get_rank()) == 0) { + std::partial_sum(rx_sizes.begin(), rx_sizes.end() - 1, rx_displs.begin() + 1); + } + + auto total_size = thrust::reduce(thrust::host, rx_sizes.begin(), rx_sizes.end()); + rmm::device_uvector gathered_v(total_size, handle.get_stream()); + + cugraph::experimental::device_gatherv(handle.get_comms(), + d_input, + gathered_v.data(), + size, + rx_sizes, + rx_displs, + 0, + handle.get_stream()); + + return gathered_v; +} + +template +bool compare_renumbered_vectors(raft::handle_t const &handle, + rmm::device_uvector const &v1, + rmm::device_uvector const &v2) +{ + vertex_t max = 1 + thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + v1.begin(), + v1.end(), + vertex_t{0}); + + rmm::device_uvector map(max, size_t{0}); + + auto iter = thrust::make_zip_iterator(thrust::make_tuple(v1.begin(), v2.begin())); + + thrust::for_each(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + iter, + iter + v1.size(), + [d_map = map.data()] __device__(auto pair) { + vertex_t e1 = thrust::get<0>(pair); + vertex_t e2 = thrust::get<1>(pair); + + d_map[e1] = e2; + }); + + auto error_count = thrust::count_if( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + iter, + iter + v1.size(), + [d_map = map.data()] __device__(auto pair) { + vertex_t e1 = thrust::get<0>(pair); + vertex_t e2 = thrust::get<1>(pair); + + return (d_map[e1] != e2); + }); + + return (error_count == 0); +} + +template +void single_gpu_renumber_edgelist_given_number_map(raft::handle_t const &handle, + rmm::device_uvector &edgelist_rows_v, + rmm::device_uvector &edgelist_cols_v, + rmm::device_uvector &renumber_map_gathered_v) +{ + rmm::device_uvector index_v(renumber_map_gathered_v.size(), handle.get_stream()); + + thrust::for_each( + rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(renumber_map_gathered_v.size()), + [d_renumber_map_gathered = renumber_map_gathered_v.data(), d_index = index_v.data()] __device__( + auto idx) { d_index[d_renumber_map_gathered[idx]] = idx; }); + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_rows_v.begin(), + edgelist_rows_v.end(), + edgelist_rows_v.begin(), + [d_index = index_v.data()] __device__(auto v) { return d_index[v]; }); + + thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + edgelist_cols_v.begin(), + edgelist_cols_v.end(), + edgelist_cols_v.begin(), + [d_index = index_v.data()] __device__(auto v) { return d_index[v]; }); +} + +template +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + compressed_sparse_to_edgelist(edge_t const *compressed_sparse_offsets, + vertex_t const *compressed_sparse_indices, + weight_t const *compressed_sparse_weights, + vertex_t major_first, + vertex_t major_last, + cudaStream_t stream) +{ + edge_t number_of_edges{0}; + raft::update_host( + &number_of_edges, compressed_sparse_offsets + (major_last - major_first), 1, stream); + CUDA_TRY(cudaStreamSynchronize(stream)); + rmm::device_uvector edgelist_major_vertices(number_of_edges, stream); + rmm::device_uvector edgelist_minor_vertices(number_of_edges, stream); + rmm::device_uvector edgelist_weights( + compressed_sparse_weights != nullptr ? 
number_of_edges : 0, stream); + + // FIXME: this is highly inefficient for very high-degree vertices, for better performance, we can + // fill high-degree vertices using one CUDA block per vertex, mid-degree vertices using one CUDA + // warp per vertex, and low-degree vertices using one CUDA thread per block + thrust::for_each(rmm::exec_policy(stream)->on(stream), + thrust::make_counting_iterator(major_first), + thrust::make_counting_iterator(major_last), + [compressed_sparse_offsets, + major_first, + p_majors = edgelist_major_vertices.begin()] __device__(auto v) { + auto first = compressed_sparse_offsets[v - major_first]; + auto last = compressed_sparse_offsets[v - major_first + 1]; + thrust::fill(thrust::seq, p_majors + first, p_majors + last, v); + }); + thrust::copy(rmm::exec_policy(stream)->on(stream), + compressed_sparse_indices, + compressed_sparse_indices + number_of_edges, + edgelist_minor_vertices.begin()); + if (compressed_sparse_weights != nullptr) { + thrust::copy(rmm::exec_policy(stream)->on(stream), + compressed_sparse_weights, + compressed_sparse_weights + number_of_edges, + edgelist_weights.data()); + } + + return std::make_tuple(std::move(edgelist_major_vertices), + std::move(edgelist_minor_vertices), + std::move(edgelist_weights)); +} + +template +void sort_and_coarsen_edgelist(rmm::device_uvector &edgelist_major_vertices /* [INOUT] */, + rmm::device_uvector &edgelist_minor_vertices /* [INOUT] */, + rmm::device_uvector &edgelist_weights /* [INOUT] */, + cudaStream_t stream) +{ + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + + size_t number_of_edges{0}; + if (edgelist_weights.size() > 0) { + thrust::sort_by_key(rmm::exec_policy(stream)->on(stream), + pair_first, + pair_first + edgelist_major_vertices.size(), + edgelist_weights.begin()); + + rmm::device_uvector tmp_edgelist_major_vertices(edgelist_major_vertices.size(), + stream); + rmm::device_uvector tmp_edgelist_minor_vertices(tmp_edgelist_major_vertices.size(), + stream); + rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream); + auto it = thrust::reduce_by_key( + rmm::exec_policy(stream)->on(stream), + pair_first, + pair_first + edgelist_major_vertices.size(), + edgelist_weights.begin(), + thrust::make_zip_iterator(thrust::make_tuple(tmp_edgelist_major_vertices.begin(), + tmp_edgelist_minor_vertices.begin())), + tmp_edgelist_weights.begin()); + number_of_edges = thrust::distance(tmp_edgelist_weights.begin(), thrust::get<1>(it)); + + edgelist_major_vertices = std::move(tmp_edgelist_major_vertices); + edgelist_minor_vertices = std::move(tmp_edgelist_minor_vertices); + edgelist_weights = std::move(tmp_edgelist_weights); + } else { + thrust::sort(rmm::exec_policy(stream)->on(stream), + pair_first, + pair_first + edgelist_major_vertices.size()); + auto it = thrust::unique(rmm::exec_policy(stream)->on(stream), + pair_first, + pair_first + edgelist_major_vertices.size()); + number_of_edges = thrust::distance(pair_first, it); + } + + edgelist_major_vertices.resize(number_of_edges, stream); + edgelist_minor_vertices.resize(number_of_edges, stream); + edgelist_weights.resize(number_of_edges, stream); + edgelist_major_vertices.shrink_to_fit(stream); + edgelist_minor_vertices.shrink_to_fit(stream); + edgelist_weights.shrink_to_fit(stream); +} + +template +std:: + tuple, rmm::device_uvector, rmm::device_uvector> + compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + edge_t const 
*compressed_sparse_offsets, + vertex_t const *compressed_sparse_indices, + weight_t const *compressed_sparse_weights, + vertex_t const *p_major_labels, + vertex_t const *p_minor_labels, + vertex_t major_first, + vertex_t major_last, + vertex_t minor_first, + vertex_t minor_last, + cudaStream_t stream) +{ + // FIXME: it might be possible to directly create relabled & coarsened edgelist from the + // compressed sparse format to save memory + + rmm::device_uvector edgelist_major_vertices(0, stream); + rmm::device_uvector edgelist_minor_vertices(0, stream); + rmm::device_uvector edgelist_weights(0, stream); + std::tie(edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights) = + compressed_sparse_to_edgelist(compressed_sparse_offsets, + compressed_sparse_indices, + compressed_sparse_weights, + major_first, + major_last, + stream); + + auto pair_first = thrust::make_zip_iterator( + thrust::make_tuple(edgelist_major_vertices.begin(), edgelist_minor_vertices.begin())); + thrust::transform( + rmm::exec_policy(stream)->on(stream), + pair_first, + pair_first + edgelist_major_vertices.size(), + pair_first, + [p_major_labels, p_minor_labels, major_first, minor_first] __device__(auto val) { + return thrust::make_tuple(p_major_labels[thrust::get<0>(val) - major_first], + p_minor_labels[thrust::get<1>(val) - minor_first]); + }); + + sort_and_coarsen_edgelist( + edgelist_major_vertices, edgelist_minor_vertices, edgelist_weights, stream); + + return std::make_tuple(std::move(edgelist_major_vertices), + std::move(edgelist_minor_vertices), + std::move(edgelist_weights)); +} + +// single-GPU version +template +std::unique_ptr> +coarsen_graph( + raft::handle_t const &handle, + cugraph::experimental::graph_view_t const + &graph_view, + vertex_t const *labels) +{ + rmm::device_uvector coarsened_edgelist_major_vertices(0, handle.get_stream()); + rmm::device_uvector coarsened_edgelist_minor_vertices(0, handle.get_stream()); + rmm::device_uvector coarsened_edgelist_weights(0, handle.get_stream()); + std::tie(coarsened_edgelist_major_vertices, + coarsened_edgelist_minor_vertices, + coarsened_edgelist_weights) = + compressed_sparse_to_relabeled_and_sorted_and_coarsened_edgelist( + graph_view.offsets(), + graph_view.indices(), + graph_view.weights(), + labels, + labels, + vertex_t{0}, + graph_view.get_number_of_vertices(), + vertex_t{0}, + graph_view.get_number_of_vertices(), + handle.get_stream()); + + cugraph::experimental::edgelist_t edgelist{}; + edgelist.p_src_vertices = store_transposed ? coarsened_edgelist_minor_vertices.data() + : coarsened_edgelist_major_vertices.data(); + edgelist.p_dst_vertices = store_transposed ? 
coarsened_edgelist_major_vertices.data() + : coarsened_edgelist_minor_vertices.data(); + edgelist.p_edge_weights = coarsened_edgelist_weights.data(); + edgelist.number_of_edges = static_cast(coarsened_edgelist_major_vertices.size()); + + vertex_t new_number_of_vertices = + 1 + thrust::reduce(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + labels, + labels + graph_view.get_number_of_vertices(), + vertex_t{0}, + thrust::maximum()); + + return std::make_unique< + cugraph::experimental::graph_t>( + handle, + edgelist, + new_number_of_vertices, + cugraph::experimental::graph_properties_t{graph_view.is_symmetric(), false}, + true); +} + +// explicit instantiation + +template void single_gpu_renumber_edgelist_given_number_map( + raft::handle_t const &handle, + rmm::device_uvector &d_edgelist_rows, + rmm::device_uvector &d_edgelist_cols, + rmm::device_uvector &d_renumber_map_gathered_v); + +template rmm::device_uvector gather_distributed_vector(raft::handle_t const &handle, + int const *d_input, + size_t size); + +template bool compare_renumbered_vectors(raft::handle_t const &handle, + rmm::device_uvector const &v1, + rmm::device_uvector const &v2); + +template std::unique_ptr> +coarsen_graph( + raft::handle_t const &handle, + cugraph::experimental::graph_view_t const &graph_view, + int32_t const *labels); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/community/mg_louvain_helper.hpp b/cpp/tests/community/mg_louvain_helper.hpp new file mode 100644 index 00000000000..87d46907bfc --- /dev/null +++ b/cpp/tests/community/mg_louvain_helper.hpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include + +namespace cugraph { +namespace test { + +template +rmm::device_uvector gather_distributed_vector(raft::handle_t const &handle, + T const *d_input, + size_t size); + +template +bool compare_renumbered_vectors(raft::handle_t const &handle, + rmm::device_uvector const &v1, + rmm::device_uvector const &v2); + +template +void single_gpu_renumber_edgelist_given_number_map( + raft::handle_t const &handle, + rmm::device_uvector &d_edgelist_rows, + rmm::device_uvector &d_edgelist_cols, + rmm::device_uvector &d_renumber_map_gathered_v); + +template +std::unique_ptr> +coarsen_graph(raft::handle_t const &handle, + cugraph::experimental::graph_view_t const &graph_view, + vertex_t const *labels); + +} // namespace test +} // namespace cugraph diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index e97f93d4424..d62a0c8eae1 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -14,16 +14,21 @@ * limitations under the License. 
*/ +#include "mg_louvain_helper.hpp" + #include #include #include #include +#include #include #include #include +#include + #include void compare(float modularity, float sg_modularity) { ASSERT_FLOAT_EQ(modularity, sg_modularity); } @@ -75,39 +80,87 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam virtual void SetUp() {} virtual void TearDown() {} - // Return the results of running louvain on a single GPU for the dataset in - // graph_file_path. + // Compare the results of MNMG Louvain with the results of running + // each step of SG Louvain, renumbering the coarsened graphs based + // on the MNMG renumbering. template - std::tuple> get_sg_results( - raft::handle_t const& handle, - std::string const& graph_file_path, - size_t max_level, - weight_t resolution) + void compare_sg_results(raft::handle_t const& handle, + std::string const& graph_filename, + rmm::device_uvector& d_renumber_map_gathered_v, + cugraph::Dendrogram const& dendrogram, + weight_t resolution, + int rank, + weight_t modularity) { - // FIXME: Put this in the Graph test base class - // (make the call simpler here) - auto graph_tuple = - cugraph::test::read_graph_from_matrix_market_file( - handle, - graph_file_path, - true, - false); // FIXME: should use param.test_weighted instead of true - - auto graph_view = std::get<0>(graph_tuple).view(); - cudaStream_t stream = handle.get_stream(); + auto sg_graph = + std::make_unique>( + handle); + rmm::device_uvector d_clustering_v(0, handle.get_stream()); + weight_t sg_modularity; - rmm::device_uvector clustering_v(graph_view.get_number_of_local_vertices(), stream); + if (rank == 0) { + // Create initial SG graph, renumbered according to the MNMG renumber map + rmm::device_uvector d_edgelist_rows(0, handle.get_stream()); + rmm::device_uvector d_edgelist_cols(0, handle.get_stream()); + rmm::device_uvector d_edgelist_weights(0, handle.get_stream()); + vertex_t number_of_vertices{}; + bool is_symmetric{}; + + std::tie( + d_edgelist_rows, d_edgelist_cols, d_edgelist_weights, number_of_vertices, is_symmetric) = + cugraph::test::read_edgelist_from_matrix_market_file( + handle, graph_filename, true); + + rmm::device_uvector d_vertices(number_of_vertices, handle.get_stream()); + std::vector h_vertices(number_of_vertices); + + d_clustering_v.resize(d_vertices.size(), handle.get_stream()); + + thrust::sequence(thrust::host, h_vertices.begin(), h_vertices.end(), vertex_t{0}); + raft::update_device( + d_vertices.data(), h_vertices.data(), d_vertices.size(), handle.get_stream()); + + // renumber using d_renumber_map_gathered_v + cugraph::test::single_gpu_renumber_edgelist_given_number_map( + handle, d_edgelist_rows, d_edgelist_cols, d_renumber_map_gathered_v); + + std::tie(*sg_graph, std::ignore) = + cugraph::test::generate_graph_from_edgelist( + handle, + std::move(d_vertices), + std::move(d_edgelist_rows), + std::move(d_edgelist_cols), + std::move(d_edgelist_weights), + is_symmetric, + true, + false); + } - size_t level; - weight_t modularity; + std::for_each( + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(dendrogram.num_levels()), + [&dendrogram, &sg_graph, &d_clustering_v, &sg_modularity, &handle, resolution, rank]( + size_t i) { + auto d_dendrogram_gathered_v = cugraph::test::gather_distributed_vector( + handle, dendrogram.get_level_ptr_nocheck(i), dendrogram.get_level_size_nocheck(i)); + + if (rank == 0) { + auto graph_view = sg_graph->view(); + + d_clustering_v.resize(graph_view.get_number_of_vertices(), handle.get_stream()); - std::tie(level, 
modularity) = - cugraph::louvain(handle, graph_view, clustering_v.data(), max_level, resolution); + std::tie(std::ignore, sg_modularity) = + cugraph::louvain(handle, graph_view, d_clustering_v.data(), 1, resolution); - std::vector clustering(graph_view.get_number_of_local_vertices()); - raft::update_host(clustering.data(), clustering_v.data(), clustering_v.size(), stream); + EXPECT_TRUE(cugraph::test::compare_renumbered_vectors( + handle, d_clustering_v, d_dendrogram_gathered_v)); - return std::make_tuple(level, modularity, clustering); + sg_graph = + cugraph::test::coarsen_graph(handle, graph_view, d_dendrogram_gathered_v.data()); + } + }); + + if (rank == 0) compare(modularity, sg_modularity); } // Compare the results of running louvain on multiple GPUs to that of a @@ -134,43 +187,33 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam cudaStream_t stream = handle.get_stream(); cugraph::experimental::graph_t mg_graph(handle); + rmm::device_uvector d_renumber_map_labels(0, handle.get_stream()); std::tie(mg_graph, d_renumber_map_labels) = cugraph::test::read_graph_from_matrix_market_file( - handle, param.graph_file_full_path, true, false); - - // Each GPU will have a subset of the clustering - int sg_level; - weight_t sg_modularity; - std::vector sg_clustering; - - // FIXME: Consider how to test for max_level > 1 - // perhaps some sort of approximation - // size_t local_max_level{param.max_level}; - size_t local_max_level{1}; + handle, param.graph_file_full_path, true, true); auto mg_graph_view = mg_graph.view(); - rmm::device_uvector clustering_v(mg_graph_view.get_number_of_local_vertices(), - stream); - - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - int level; + std::unique_ptr> dendrogram; weight_t modularity; - std::tie(level, modularity) = cugraph::louvain( - handle, mg_graph_view, clustering_v.data(), local_max_level, param.resolution); + std::tie(dendrogram, modularity) = + cugraph::louvain(handle, mg_graph_view, param.max_level, param.resolution); - if (comm_rank == 0) { - SCOPED_TRACE("compare modularity input: " + param.graph_file_full_path); + SCOPED_TRACE("compare modularity input: " + param.graph_file_full_path); - std::tie(sg_level, sg_modularity, sg_clustering) = get_sg_results( - handle, param.graph_file_full_path, local_max_level, param.resolution); + auto d_renumber_map_gathered_v = cugraph::test::gather_distributed_vector( + handle, d_renumber_map_labels.data(), d_renumber_map_labels.size()); - compare(modularity, sg_modularity); - } + compare_sg_results(handle, + param.graph_file_full_path, + d_renumber_map_gathered_v, + *dendrogram, + param.resolution, + comm_rank, + modularity); } }; @@ -183,7 +226,8 @@ TEST_P(Louvain_MG_Testfixture, CheckInt32Int32Float) INSTANTIATE_TEST_CASE_P( simple_test, Louvain_MG_Testfixture, - ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true, 100, 1), - Louvain_Usecase("test/datasets/smallworld.mtx", true, 100, 1))); + ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true, 100, 1) + //,Louvain_Usecase("test/datasets/smallworld.mtx", true, 100, 1) + )); CUGRAPH_MG_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp index e8f11acfbf4..78ac57dba58 100644 --- a/cpp/tests/utilities/base_fixture.hpp +++ b/cpp/tests/utilities/base_fixture.hpp @@ -160,6 +160,11 @@ inline auto parse_test_options(int argc, char **argv) auto const cmd_opts = parse_test_options(argc, argv); \ auto const rmm_mode = 
cmd_opts["rmm_mode"].as(); \ auto resource = cugraph::test::create_memory_resource(rmm_mode); \ + \ + if (comm_rank != 0) { \ + auto& listeners = ::testing::UnitTest::GetInstance()->listeners(); \ + delete listeners.Release(listeners.default_result_printer()); \ + } \ rmm::mr::set_current_device_resource(resource.get()); \ auto ret = RUN_ALL_TESTS(); \ MPI_TRY(MPI_Finalize()); \ From b9cdf406a6d6426cc73fbf320fecf56acde97a55 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 13:44:57 -0400 Subject: [PATCH 10/18] clean up serial louvain tests --- cpp/include/algorithms.hpp | 4 +- cpp/src/experimental/louvain.cuh | 8 +- cpp/tests/CMakeLists.txt | 8 -- cpp/tests/community/louvain_test.cpp | 160 +++++++++++++++++++++++-- cpp/tests/experimental/louvain_test.cu | 133 -------------------- 5 files changed, 158 insertions(+), 155 deletions(-) delete mode 100644 cpp/tests/experimental/louvain_test.cu diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp index 1c4a84a27e4..beb4333d508 100644 --- a/cpp/include/algorithms.hpp +++ b/cpp/include/algorithms.hpp @@ -656,8 +656,8 @@ std::pair louvain( * * @tparam graph_t Type of graph * - * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, - * @param[in] graph input graph object (CSR) + * @param[in] handle Library handle (RAFT) + * @param[in] graph Input graph object (CSR) * @param[in] max_level (optional) maximum number of levels to run (default 100) * @param[in] resolution (optional) The value of the resolution parameter to use. * Called gamma in the modularity formula, this changes the size diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 9b2840aab3a..30c9c1cfc4d 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -29,6 +29,7 @@ #include #include +#include //#define TIMING @@ -408,6 +409,11 @@ class Louvain { map_key_last = d_dst_cluster_cache_ + dst_cluster_cache_v_.size(); map_value_first = dst_cluster_weights_v.begin(); } else { + thrust::sort_by_key(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.begin()); + thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), next_cluster_v.begin(), next_cluster_v.end(), @@ -415,7 +421,7 @@ class Louvain { [d_cluster_weights = cluster_weights_v_.data(), d_cluster_keys = cluster_keys_v_.data(), num_clusters = cluster_keys_v_.size()] __device__(vertex_t cluster) { - auto pos = thrust::find( + auto pos = thrust::lower_bound( thrust::seq, d_cluster_keys, d_cluster_keys + num_clusters, cluster); return d_cluster_weights[pos - d_cluster_keys]; }); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 9b8c1b9322e..97d3f77e26e 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -391,14 +391,6 @@ set(EXPERIMENTAL_PAGERANK_TEST_SRCS ConfigureTest(EXPERIMENTAL_PAGERANK_TEST "${EXPERIMENTAL_PAGERANK_TEST_SRCS}") -################################################################################################### -# - Experimental LOUVAIN tests ------------------------------------------------------------------- - -set(EXPERIMENTAL_LOUVAIN_TEST_SRCS - "${CMAKE_CURRENT_SOURCE_DIR}/experimental/louvain_test.cu") - -ConfigureTest(EXPERIMENTAL_LOUVAIN_TEST "${EXPERIMENTAL_LOUVAIN_TEST_SRCS}") - ################################################################################################### # - Experimental KATZ_CENTRALITY tests 
------------------------------------------------------------ diff --git a/cpp/tests/community/louvain_test.cpp b/cpp/tests/community/louvain_test.cpp index d3024282be3..ab92ce411b9 100644 --- a/cpp/tests/community/louvain_test.cpp +++ b/cpp/tests/community/louvain_test.cpp @@ -9,15 +9,141 @@ * */ #include +#include + +#include +#include +#include +#include + +#include #include -#include -#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +struct Louvain_Usecase { + std::string graph_file_full_path_{}; + bool test_weighted_{false}; + int expected_level_{0}; + float expected_modularity_{0}; + + Louvain_Usecase(std::string const& graph_file_path, + bool test_weighted, + int expected_level, + float expected_modularity) + : test_weighted_(test_weighted), + expected_level_(expected_level), + expected_modularity_(expected_modularity) + { + if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { + graph_file_full_path_ = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; + } else { + graph_file_full_path_ = graph_file_path; + } + }; +}; + +class Tests_Louvain : public ::testing::TestWithParam { + public: + Tests_Louvain() {} + static void SetupTestCase() {} + static void TearDownTestCase() {} + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_legacy_test(Louvain_Usecase const& configuration) + { + raft::handle_t handle{}; + + bool directed{false}; + + auto graph = cugraph::test::generate_graph_csr_from_mm( + directed, configuration.graph_file_full_path_); + auto graph_view = graph->view(); + + louvain(graph_view, + graph_view.get_number_of_vertices(), + configuration.expected_level_, + configuration.expected_modularity_); + } -#include + template + void run_current_test(Louvain_Usecase const& configuration) + { + raft::handle_t handle{}; + + cugraph::experimental::graph_t graph(handle); + std::tie(graph, std::ignore) = + cugraph::test::read_graph_from_matrix_market_file( + handle, configuration.graph_file_full_path_, configuration.test_weighted_, false); + + auto graph_view = graph.view(); + + // "FIXME": remove this check once we drop support for Pascal + // + // Calling louvain on Pascal will throw an exception, we'll check that + // this is the behavior while we still support Pascal (device_prop.major < 7) + // + cudaDeviceProp device_prop; + CUDA_CHECK(cudaGetDeviceProperties(&device_prop, 0)); + + if (device_prop.major < 7) { + EXPECT_THROW(louvain(graph_view, + graph_view.get_number_of_local_vertices(), + configuration.expected_level_, + configuration.expected_modularity_), + cugraph::logic_error); + } else { + louvain(graph_view, + graph_view.get_number_of_local_vertices(), + configuration.expected_level_, + configuration.expected_modularity_); + } + } + + template + void louvain(graph_t const& graph_view, + typename graph_t::vertex_type num_vertices, + int expected_level, + float expected_modularity) + { + using vertex_t = typename graph_t::vertex_type; + using weight_t = typename graph_t::weight_type; + + raft::handle_t handle{}; + + rmm::device_uvector clustering_v(num_vertices, handle.get_stream()); + size_t level; + weight_t modularity; + + std::tie(level, modularity) = + cugraph::louvain(handle, graph_view, clustering_v.data(), size_t{100}, weight_t{1}); + + CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + + float compare_modularity = static_cast(modularity); -TEST(louvain, success) + ASSERT_FLOAT_EQ(compare_modularity, expected_modularity); 
+ ASSERT_EQ(level, expected_level); + } +}; + +// FIXME: add tests for type combinations + +TEST(louvain_legacy, success) { raft::handle_t handle; @@ -84,15 +210,13 @@ TEST(louvain, success) int min = *min_element(cluster_id.begin(), cluster_id.end()); - std::cout << "modularity = " << modularity << std::endl; - ASSERT_GE(min, 0); - ASSERT_GE(modularity, 0.402777 * 0.95); + ASSERT_FLOAT_EQ(modularity, 0.408695); ASSERT_EQ(cluster_id, result_h); } } -TEST(louvain_renumbered, success) +TEST(louvain_legacy_renumbered, success) { raft::handle_t handle; @@ -157,11 +281,25 @@ TEST(louvain_renumbered, success) int min = *min_element(cluster_id.begin(), cluster_id.end()); - std::cout << "modularity = " << modularity << std::endl; - ASSERT_GE(min, 0); - ASSERT_GE(modularity, 0.402777 * 0.95); + ASSERT_FLOAT_EQ(modularity, 0.41880345); } } +TEST_P(Tests_Louvain, CheckInt32Int32FloatFloatLegacy) +{ + run_legacy_test(GetParam()); +} + +TEST_P(Tests_Louvain, CheckInt32Int32FloatFloat) +{ + run_current_test(GetParam()); +} + +// FIXME: Expand testing once we evaluate RMM memory use +INSTANTIATE_TEST_CASE_P( + simple_test, + Tests_Louvain, + ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true, 3, 0.408695))); + CUGRAPH_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/experimental/louvain_test.cu b/cpp/tests/experimental/louvain_test.cu deleted file mode 100644 index 56fb2c109bf..00000000000 --- a/cpp/tests/experimental/louvain_test.cu +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governin_from_mtxg permissions and - * limitations under the License. 
- */ - -#include -#include - -#include -#include -#include - -#include -#include - -#include - -#include -#include -#include - -#include - -#include -#include -#include -#include -#include - -typedef struct Louvain_Usecase_t { - std::string graph_file_full_path{}; - bool test_weighted{false}; - - Louvain_Usecase_t(std::string const& graph_file_path, bool test_weighted) - : test_weighted(test_weighted) - { - if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) { - graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path; - } else { - graph_file_full_path = graph_file_path; - } - }; -} Louvain_Usecase; - -class Tests_Louvain : public ::testing::TestWithParam { - public: - Tests_Louvain() {} - static void SetupTestCase() {} - static void TearDownTestCase() {} - - virtual void SetUp() {} - virtual void TearDown() {} - - template - void run_current_test(Louvain_Usecase const& configuration) - { - raft::handle_t handle{}; - - std::cout << "read graph file: " << configuration.graph_file_full_path << std::endl; - - cugraph::experimental::graph_t graph(handle); - std::tie(graph, std::ignore) = - cugraph::test::read_graph_from_matrix_market_file( - handle, configuration.graph_file_full_path, configuration.test_weighted, false); - - auto graph_view = graph.view(); - - // "FIXME": remove this check once we drop support for Pascal - // - // Calling louvain on Pascal will throw an exception, we'll check that - // this is the behavior while we still support Pascal (device_prop.major < 7) - // - cudaDeviceProp device_prop; - CUDA_CHECK(cudaGetDeviceProperties(&device_prop, 0)); - - if (device_prop.major < 7) { - EXPECT_THROW(louvain(graph_view), cugraph::logic_error); - } else { - louvain(graph_view); - } - } - - template - void louvain(graph_t const& graph_view) - { - using vertex_t = typename graph_t::vertex_type; - using weight_t = typename graph_t::weight_type; - - raft::handle_t handle{}; - - rmm::device_vector clustering_v(graph_view.get_number_of_local_vertices()); - size_t level; - weight_t modularity; - - std::tie(level, modularity) = - cugraph::louvain(handle, graph_view, clustering_v.data().get(), size_t{100}, weight_t{1}); - - CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - - std::cout << "level = " << level << std::endl; - std::cout << "modularity = " << modularity << std::endl; - } -}; - -// FIXME: add tests for type combinations -TEST_P(Tests_Louvain, CheckInt32Int32FloatFloat) -{ - run_current_test(GetParam()); -} - -INSTANTIATE_TEST_CASE_P(simple_test, - Tests_Louvain, - ::testing::Values(Louvain_Usecase("test/datasets/karate.mtx", true) -#if 0 - , - Louvain_Usecase("test/datasets/web-Google.mtx", true), - Louvain_Usecase("test/datasets/ljournal-2008.mtx", true), - Louvain_Usecase("test/datasets/webbase-1M.mtx", true) -#endif - )); - -CUGRAPH_TEST_PROGRAM_MAIN() From b64e65bec3ade1462101faa15451bead0de6c17f Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 15:42:38 -0400 Subject: [PATCH 11/18] revert generate_graph_from_edgelist changes, not necessary with new test --- cpp/src/community/louvain.cu | 79 ++++++- cpp/tests/community/mg_louvain_test.cpp | 2 +- .../utilities/generate_graph_from_edgelist.cu | 210 +++++------------- 3 files changed, 123 insertions(+), 168 deletions(-) diff --git a/cpp/src/community/louvain.cu b/cpp/src/community/louvain.cu index ab27923cdb1..a3a2ffa39eb 100644 --- a/cpp/src/community/louvain.cu +++ b/cpp/src/community/louvain.cu @@ -82,11 +82,8 @@ void 
flatten_dendrogram(raft::handle_t const &handle, vertex_ids_v.end(), vertex_t{0}); - partition_at_level(handle, - dendrogram, - vertex_ids_v.data(), - clustering, - dendrogram.num_levels()); + partition_at_level( + handle, dendrogram, vertex_ids_v.data(), clustering, dendrogram.num_levels()); } template @@ -104,11 +101,8 @@ void flatten_dendrogram( vertex_ids_v.end(), graph_view.get_local_vertex_first()); - partition_at_level(handle, - dendrogram, - vertex_ids_v.data(), - clustering, - dendrogram.num_levels()); + partition_at_level( + handle, dendrogram, vertex_ids_v.data(), clustering, dendrogram.num_levels()); } } // namespace detail @@ -127,8 +121,8 @@ template void flatten_dendrogram(raft::handle_t const &handle, graph_t const &graph_view, Dendrogram const &dendrogram, - typename graph_t::vertex_type *clustering) { - + typename graph_t::vertex_type *clustering) +{ detail::flatten_dendrogram(handle, graph_view, dendrogram, clustering); } @@ -155,6 +149,67 @@ std::pair louvain(raft::handle_t const &h } // Explicit template instantations +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, float> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + float); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); +template std::pair>, double> louvain( + raft::handle_t const &, + experimental::graph_view_t const &, + size_t, + double); + template std::pair louvain( raft::handle_t const &, GraphCSRView const &, int32_t *, size_t, float); template std::pair louvain(raft::handle_t const &, diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index d62a0c8eae1..f6596a6b59a 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -150,7 +150,7 @@ class Louvain_MG_Testfixture : public ::testing::TestWithParam d_clustering_v.resize(graph_view.get_number_of_vertices(), handle.get_stream()); std::tie(std::ignore, sg_modularity) = - cugraph::louvain(handle, graph_view, d_clustering_v.data(), 1, resolution); + cugraph::louvain(handle, graph_view, d_clustering_v.data(), size_t{1}, resolution); EXPECT_TRUE(cugraph::test::compare_renumbered_vectors( handle, d_clustering_v, d_dendrogram_gathered_v)); diff --git a/cpp/tests/utilities/generate_graph_from_edgelist.cu b/cpp/tests/utilities/generate_graph_from_edgelist.cu index fa10c31a5a4..1b9fe6051f7 100644 --- 
a/cpp/tests/utilities/generate_graph_from_edgelist.cu +++ b/cpp/tests/utilities/generate_graph_from_edgelist.cu @@ -30,47 +30,39 @@ namespace test { namespace detail { -template -struct compute_gpu_id_from_vertex_no_renumbering { - int comm_size{0}; - vertex_t num_vertices{1}; - - __device__ int operator()(vertex_t v) const - { - vertex_t vertices_per_gpu = static_cast(num_vertices + comm_size - 1) / comm_size; - return static_cast(v / vertices_per_gpu); - } -}; - -template -struct compute_gpu_id_from_edge_no_renumbering { - bool hypergraph_partitioned{false}; - int comm_size{0}; - int row_comm_size{0}; - int col_comm_size{0}; - vertex_t num_vertices{1}; - - __device__ int operator()(vertex_t major, vertex_t minor) const - { - vertex_t vertices_per_gpu = static_cast(num_vertices + comm_size - 1) / comm_size; - auto major_comm_rank = static_cast(major / vertices_per_gpu); - auto minor_comm_rank = static_cast(minor / vertices_per_gpu); - - if (hypergraph_partitioned) { - return (minor_comm_rank / col_comm_size) * row_comm_size + (major_comm_rank % row_comm_size); - } else { - return (major_comm_rank - (major_comm_rank % row_comm_size)) + - (minor_comm_rank / col_comm_size); - } - } -}; - -template -void filter_vertices(raft::handle_t const& handle, - rmm::device_uvector& vertices, - key_function_t vertex_key_func, - int comm_rank) +template +std::enable_if_t< + multi_gpu, + std::tuple< + cugraph::experimental::graph_t, + rmm::device_uvector>> +generate_graph_from_edgelist(raft::handle_t const& handle, + rmm::device_uvector&& vertices, + rmm::device_uvector&& edgelist_rows, + rmm::device_uvector&& edgelist_cols, + rmm::device_uvector&& edgelist_weights, + bool is_symmetric, + bool test_weighted, + bool renumber) { + CUGRAPH_EXPECTS(renumber, "renumber should be true if multi_gpu is true."); + + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto const comm_rank = comm.get_rank(); + auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); + auto const row_comm_size = row_comm.get_size(); + auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); + auto const col_comm_size = col_comm.get_size(); + + vertex_t number_of_vertices = static_cast(vertices.size()); + + auto vertex_key_func = + cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}; vertices.resize(thrust::distance(vertices.begin(), thrust::remove_if( rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), @@ -80,17 +72,10 @@ void filter_vertices(raft::handle_t const& handle, return key_func(val) != comm_rank; })), handle.get_stream()); -} + vertices.shrink_to_fit(handle.get_stream()); -template -void filter_edges(raft::handle_t const& handle, - rmm::device_uvector& edgelist_rows, - rmm::device_uvector& edgelist_cols, - rmm::device_uvector& edgelist_weights, - key_function_t edge_key_func, - int comm_rank, - bool test_weighted) -{ + auto edge_key_func = cugraph::experimental::detail::compute_gpu_id_from_edge_t{ + false, comm_size, row_comm_size, col_comm_size}; size_t number_of_local_edges{}; if (test_weighted) { auto edge_first = thrust::make_zip_iterator( @@ -121,120 +106,35 @@ void filter_edges(raft::handle_t const& handle, } edgelist_rows.resize(number_of_local_edges, handle.get_stream()); - edgelist_cols.resize(number_of_local_edges, handle.get_stream()); - if (test_weighted) edgelist_weights.resize(number_of_local_edges, handle.get_stream()); -} - -template -std::enable_if_t< - multi_gpu, - std::tuple< - 
cugraph::experimental::graph_t, - rmm::device_uvector>> -generate_graph_from_edgelist(raft::handle_t const& handle, - rmm::device_uvector&& vertices, - rmm::device_uvector&& edgelist_rows, - rmm::device_uvector&& edgelist_cols, - rmm::device_uvector&& edgelist_weights, - bool is_symmetric, - bool test_weighted, - bool renumber) -{ - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - auto const comm_rank = comm.get_rank(); - auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); - auto const row_comm_size = row_comm.get_size(); - auto const row_comm_rank = row_comm.get_rank(); - auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); - auto const col_comm_size = col_comm.get_size(); - auto const col_comm_rank = col_comm.get_rank(); - - vertex_t number_of_vertices = static_cast(vertices.size()); - edge_t number_of_edges = static_cast(edgelist_rows.size()); - - if (renumber) { - filter_vertices( - handle, - vertices, - cugraph::experimental::detail::compute_gpu_id_from_vertex_t{comm_size}, - comm_rank); - filter_edges( - handle, - edgelist_rows, - edgelist_cols, - edgelist_weights, - cugraph::experimental::detail::compute_gpu_id_from_edge_t{ - false, comm_size, row_comm_size, col_comm_size}, - comm_rank, - test_weighted); - } else { - filter_vertices( - handle, - vertices, - compute_gpu_id_from_vertex_no_renumbering{comm_size, number_of_vertices}, - comm_rank); - filter_edges(handle, - edgelist_rows, - edgelist_cols, - edgelist_weights, - compute_gpu_id_from_edge_no_renumbering{ - false, comm_size, row_comm_size, col_comm_size, number_of_vertices}, - comm_rank, - test_weighted); - } - - vertices.shrink_to_fit(handle.get_stream()); edgelist_rows.shrink_to_fit(handle.get_stream()); + edgelist_cols.resize(number_of_local_edges, handle.get_stream()); edgelist_cols.shrink_to_fit(handle.get_stream()); - if (test_weighted) edgelist_weights.shrink_to_fit(handle.get_stream()); + if (test_weighted) { + edgelist_weights.resize(number_of_local_edges, handle.get_stream()); + edgelist_weights.shrink_to_fit(handle.get_stream()); + } // 3. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); cugraph::experimental::partition_t partition{}; vertex_t aggregate_number_of_vertices{}; - - if (renumber) { - // FIXME: set do_expensive_check to false once validated - std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = - cugraph::experimental::renumber_edgelist( - handle, - vertices.data(), - static_cast(vertices.size()), - store_transposed ? edgelist_cols.data() : edgelist_rows.data(), - store_transposed ? 
edgelist_rows.data() : edgelist_cols.data(), - edgelist_rows.size(), - false, - true); - assert(aggregate_number_of_vertices == number_of_vertices); - } else { - std::vector vertex_partition_offsets(comm_size + 1, 0); - vertex_t vertices_per_gpu = - static_cast(number_of_vertices + comm_size - 1) / comm_size; - - std::for_each(thrust::make_counting_iterator(0), - thrust::make_counting_iterator(comm_size), - [h_vertex_partition_offsets = vertex_partition_offsets.data(), vertices_per_gpu]( - auto idx) { h_vertex_partition_offsets[idx] = idx * vertices_per_gpu; }); - - vertex_partition_offsets[comm_size] = number_of_vertices; - - partition = cugraph::experimental::partition_t(vertex_partition_offsets, - false, // is_hypergraph_partitioned, - row_comm_size, - col_comm_size, - row_comm_rank, - col_comm_rank); - - //number_of_edges = edgelist_rows.size(); - } + edge_t number_of_edges{}; + // FIXME: set do_expensive_check to false once validated + std::tie(renumber_map_labels, partition, aggregate_number_of_vertices, number_of_edges) = + cugraph::experimental::renumber_edgelist( + handle, + vertices.data(), + static_cast(vertices.size()), + store_transposed ? edgelist_cols.data() : edgelist_rows.data(), + store_transposed ? edgelist_rows.data() : edgelist_cols.data(), + edgelist_rows.size(), + false, + true); + assert(aggregate_number_of_vertices == number_of_vertices); // 4. create a graph + return std::make_tuple( cugraph::experimental::graph_t( handle, @@ -248,7 +148,7 @@ generate_graph_from_edgelist(raft::handle_t const& handle, number_of_vertices, number_of_edges, cugraph::experimental::graph_properties_t{is_symmetric, false}, - false, + true, true), std::move(renumber_map_labels)); } From bf522a4445f03ca6b2dc42a6d511ceb148b69343 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 15:51:37 -0400 Subject: [PATCH 12/18] update format and copyright information --- cpp/include/dendrogram.hpp | 6 +----- cpp/include/graph.hpp | 10 +++------- cpp/include/utilities/device_comm.cuh | 8 +++++--- cpp/src/experimental/louvain.cuh | 4 ++-- cpp/tests/community/mg_louvain_helper.cu | 20 ++++++++++---------- cpp/tests/community/mg_louvain_helper.hpp | 8 +++++--- cpp/tests/utilities/base_fixture.hpp | 2 +- 7 files changed, 27 insertions(+), 31 deletions(-) diff --git a/cpp/include/dendrogram.hpp b/cpp/include/dendrogram.hpp index 83a35110605..230f64343ad 100644 --- a/cpp/include/dendrogram.hpp +++ b/cpp/include/dendrogram.hpp @@ -41,14 +41,10 @@ class Dendrogram { vertex_t const *get_level_ptr_nocheck(size_t level) const { - //return static_cast(level_ptr_[level]->data()); return level_ptr_[level]->data(); } - vertex_t *get_level_ptr_nocheck(size_t level) - { - return level_ptr_[level]->data(); - } + vertex_t *get_level_ptr_nocheck(size_t level) { return level_ptr_[level]->data(); } size_t get_level_size_nocheck(size_t level) const { return level_ptr_[level]->size(); } diff --git a/cpp/include/graph.hpp b/cpp/include/graph.hpp index c4ca3b40c11..8ea58546ce1 100644 --- a/cpp/include/graph.hpp +++ b/cpp/include/graph.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
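
// Aside for readers following the Dendrogram accessors above and the flatten_dendrogram
// hunks earlier in this series: each dendrogram level stores, for every vertex of the
// coarsened graph at that level, the cluster it was assigned to, and "flattening" simply
// composes those per-level assignments back down to the original vertices. The sketch
// below is a host-side illustration of that idea only -- it is not the thrust/CUDA code
// in louvain.cuh, the helper name is made up for this note, and it assumes cluster ids
// at every level are 0-based and contiguous (single-GPU numbering, no vertex offset).
#include <vector>

template <typename vertex_t>
std::vector<vertex_t> flatten_dendrogram_host(std::vector<std::vector<vertex_t>> const &levels,
                                              vertex_t num_vertices)
{
  std::vector<vertex_t> clustering(num_vertices);
  for (vertex_t v = 0; v < num_vertices; ++v) clustering[v] = v;  // start from the identity
  for (auto const &level : levels) {
    for (vertex_t v = 0; v < num_vertices; ++v) {
      // relabel each vertex's current cluster id with the next (coarser) level's assignment
      clustering[v] = level[clustering[v]];
    }
  }
  return clustering;
}
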
@@ -69,13 +69,9 @@ class GraphViewBase { edge_t *local_edges; vertex_t *local_offsets; - vertex_t get_number_of_vertices() const { - return number_of_vertices; - } + vertex_t get_number_of_vertices() const { return number_of_vertices; } - vertex_t get_local_vertex_first() const { - return vertex_t{0}; - } + vertex_t get_local_vertex_first() const { return vertex_t{0}; } /** * @brief Fill the identifiers array with the vertex identifiers. diff --git a/cpp/include/utilities/device_comm.cuh b/cpp/include/utilities/device_comm.cuh index e0bbea76068..53711f21a6c 100644 --- a/cpp/include/utilities/device_comm.cuh +++ b/cpp/include/utilities/device_comm.cuh @@ -891,9 +891,11 @@ device_reduce(raft::comms::comms_t const& comm, size_t constexpr tuple_size = thrust::tuple_size::value_type>::value; - detail:: - device_reduce_tuple_iterator_element_impl().run( - comm, input_first, output_first, count, op, root, stream); + detail::device_reduce_tuple_iterator_element_impl() + .run(comm, input_first, output_first, count, op, root, stream); } template diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 30c9c1cfc4d..5ce43dbf3df 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -28,8 +28,8 @@ #include #include -#include #include +#include //#define TIMING @@ -413,7 +413,7 @@ class Louvain { cluster_keys_v_.begin(), cluster_keys_v_.end(), cluster_weights_v_.begin()); - + thrust::transform(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()), next_cluster_v.begin(), next_cluster_v.end(), diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu index 57e030da214..a7f95e6d718 100644 --- a/cpp/tests/community/mg_louvain_helper.cu +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -85,16 +85,16 @@ bool compare_renumbered_vectors(raft::handle_t const &handle, d_map[e1] = e2; }); - auto error_count = thrust::count_if( - rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - iter, - iter + v1.size(), - [d_map = map.data()] __device__(auto pair) { - vertex_t e1 = thrust::get<0>(pair); - vertex_t e2 = thrust::get<1>(pair); - - return (d_map[e1] != e2); - }); + auto error_count = + thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), + iter, + iter + v1.size(), + [d_map = map.data()] __device__(auto pair) { + vertex_t e1 = thrust::get<0>(pair); + vertex_t e2 = thrust::get<1>(pair); + + return (d_map[e1] != e2); + }); return (error_count == 0); } diff --git a/cpp/tests/community/mg_louvain_helper.hpp b/cpp/tests/community/mg_louvain_helper.hpp index 87d46907bfc..43eb294cd13 100644 --- a/cpp/tests/community/mg_louvain_helper.hpp +++ b/cpp/tests/community/mg_louvain_helper.hpp @@ -43,9 +43,11 @@ void single_gpu_renumber_edgelist_given_number_map( template std::unique_ptr> -coarsen_graph(raft::handle_t const &handle, - cugraph::experimental::graph_view_t const &graph_view, - vertex_t const *labels); +coarsen_graph( + raft::handle_t const &handle, + cugraph::experimental::graph_view_t const + &graph_view, + vertex_t const *labels); } // namespace test } // namespace cugraph diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp index 78ac57dba58..79a86e1fc95 100644 --- a/cpp/tests/utilities/base_fixture.hpp +++ b/cpp/tests/utilities/base_fixture.hpp @@ -162,7 +162,7 @@ inline auto parse_test_options(int argc, char **argv) auto resource = cugraph::test::create_memory_resource(rmm_mode); \ \ if (comm_rank != 0) { \ - auto& listeners 
= ::testing::UnitTest::GetInstance()->listeners(); \ + auto &listeners = ::testing::UnitTest::GetInstance()->listeners(); \ delete listeners.Release(listeners.default_result_printer()); \ } \ rmm::mr::set_current_device_resource(resource.get()); \ From 0810213bceca66c5b88fd80d7d835207d03da20f Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 15:56:21 -0400 Subject: [PATCH 13/18] missed a format issue --- cpp/include/dendrogram.hpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/cpp/include/dendrogram.hpp b/cpp/include/dendrogram.hpp index 230f64343ad..6316bdb25fc 100644 --- a/cpp/include/dendrogram.hpp +++ b/cpp/include/dendrogram.hpp @@ -39,10 +39,7 @@ class Dendrogram { size_t num_levels() const { return level_ptr_.size(); } - vertex_t const *get_level_ptr_nocheck(size_t level) const - { - return level_ptr_[level]->data(); - } + vertex_t const *get_level_ptr_nocheck(size_t level) const { return level_ptr_[level]->data(); } vertex_t *get_level_ptr_nocheck(size_t level) { return level_ptr_[level]->data(); } From 9e753ca3f93ee5ace1cfbdfde26e9bc687c8ce2e Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 17:26:00 -0400 Subject: [PATCH 14/18] add cuco/static_map guards for a few primitives that didn't have them --- cpp/include/dendrogram.hpp | 1 - cpp/{src => include}/experimental/include_cuco_static_map.cuh | 0 .../patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh | 2 +- .../patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh | 2 -- cpp/include/utilities/collect_comm.cuh | 2 +- 5 files changed, 2 insertions(+), 5 deletions(-) rename cpp/{src => include}/experimental/include_cuco_static_map.cuh (100%) diff --git a/cpp/include/dendrogram.hpp b/cpp/include/dendrogram.hpp index 6316bdb25fc..bb9ba470a52 100644 --- a/cpp/include/dendrogram.hpp +++ b/cpp/include/dendrogram.hpp @@ -15,7 +15,6 @@ */ #pragma once -#include #include #include diff --git a/cpp/src/experimental/include_cuco_static_map.cuh b/cpp/include/experimental/include_cuco_static_map.cuh similarity index 100% rename from cpp/src/experimental/include_cuco_static_map.cuh rename to cpp/include/experimental/include_cuco_static_map.cuh diff --git a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 11cf2cb1137..19a5f67c9de 100644 --- a/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/patterns/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -27,7 +27,7 @@ #include -#include +#include #include diff --git a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh index 0b3588bc8c5..e621ed91ddb 100644 --- a/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/patterns/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -25,8 +25,6 @@ #include -#include - #include namespace cugraph { diff --git a/cpp/include/utilities/collect_comm.cuh b/cpp/include/utilities/collect_comm.cuh index 5ca58ebeb17..8d2227c0f60 100644 --- a/cpp/include/utilities/collect_comm.cuh +++ b/cpp/include/utilities/collect_comm.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include @@ -24,7 +25,6 @@ #include #include -#include #include #include From d2d1dfe541e26dee9243484828ba276fa7b17fad Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 17:58:46 -0400 Subject: [PATCH 15/18] add guard for cuco static 
map usage in collect_values_for_keys --- cpp/src/experimental/louvain.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/src/experimental/louvain.cuh b/cpp/src/experimental/louvain.cuh index 5ce43dbf3df..3136515faa6 100644 --- a/cpp/src/experimental/louvain.cuh +++ b/cpp/src/experimental/louvain.cuh @@ -362,6 +362,7 @@ class Louvain { rmm::device_uvector &next_cluster_v, bool up_down) { +#ifdef CUCO_STATIC_MAP_DEFINED rmm::device_uvector old_cluster_sum_v( current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); rmm::device_uvector cluster_subtract_v( @@ -500,6 +501,7 @@ class Louvain { d_src_cluster_cache_, [] __device__(auto src, auto dst, auto wt, auto x, auto y) { return wt; }, weight_t{0}); +#endif } void shrink_graph() From 4a266868c88a29c5d6f43045d0a5ad195e4d48b1 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Fri, 26 Mar 2021 18:48:28 -0400 Subject: [PATCH 16/18] update louvain parameters and documentation to reference a graph_view --- cpp/include/algorithms.hpp | 39 ++++++++++++++++++------------------ cpp/src/community/louvain.cu | 36 ++++++++++++++++----------------- 2 files changed, 38 insertions(+), 37 deletions(-) diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp index 3c89c6a2dd3..b8706d81e21 100644 --- a/cpp/include/algorithms.hpp +++ b/cpp/include/algorithms.hpp @@ -616,7 +616,7 @@ weight_t hungarian(raft::handle_t const &handle, * * @throws cugraph::logic_error when an error occurs. * - * @tparam graph_t Type of graph + * @tparam graph_view_t Type of graph * * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, * @param[in] graph input graph object (CSR) @@ -633,13 +633,13 @@ weight_t hungarian(raft::handle_t const &handle, * 2) modularity of the returned clustering * */ -template -std::pair louvain( +template +std::pair louvain( raft::handle_t const &handle, - graph_t const &graph, - typename graph_t::vertex_type *clustering, - size_t max_level = 100, - typename graph_t::weight_type resolution = typename graph_t::weight_type{1}); + graph_view_t const &graph_view, + typename graph_view_t::vertex_type *clustering, + size_t max_level = 100, + typename graph_view_t::weight_type resolution = typename graph_view_t::weight_type{1}); /** * @brief Louvain implementation, returning dendrogram @@ -654,10 +654,10 @@ std::pair louvain( * * @throws cugraph::logic_error when an error occurs. * - * @tparam graph_t Type of graph + * @tparam graph_view_t Type of graph * * @param[in] handle Library handle (RAFT) - * @param[in] graph Input graph object (CSR) + * @param[in] graph_view Input graph view object (CSR) * @param[in] max_level (optional) maximum number of levels to run (default 100) * @param[in] resolution (optional) The value of the resolution parameter to use. * Called gamma in the modularity formula, this changes the size @@ -670,12 +670,13 @@ std::pair louvain( * 2) modularity of the returned clustering * */ -template -std::pair>, typename graph_t::weight_type> +template +std::pair>, + typename graph_view_t::weight_type> louvain(raft::handle_t const &handle, - graph_t const &graph, - size_t max_level = 100, - typename graph_t::weight_type resolution = typename graph_t::weight_type{1}); + graph_view_t const &graph_view, + size_t max_level = 100, + typename graph_view_t::weight_type resolution = typename graph_view_t::weight_type{1}); /** * @brief Flatten a Dendrogram at a particular level @@ -687,7 +688,7 @@ louvain(raft::handle_t const &handle, * * @throws cugraph::logic_error when an error occurs. 
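 *
 * As a usage sketch only (handle/graph_view construction and the vertex_t/weight_t
 * aliases are assumed to exist in the caller; they are not part of this declaration),
 * the dendrogram-returning louvain overload above is combined with this flatten step
 * roughly the way the clustering-pointer overload does it internally:
 *
 * @code
 *   std::unique_ptr<Dendrogram<vertex_t>> dendrogram{};
 *   weight_t modularity{};
 *   std::tie(dendrogram, modularity) =
 *     cugraph::louvain(handle, graph_view, size_t{100}, weight_t{1});
 *
 *   rmm::device_uvector<vertex_t> clustering(graph_view.get_number_of_local_vertices(),
 *                                            handle.get_stream());
 *   cugraph::flatten_dendrogram(handle, graph_view, *dendrogram, clustering.data());
 * @endcode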
* - * @tparam graph_t Type of graph + * @tparam graph_view_t Type of graph * * @param[in] handle Library handle (RAFT). If a communicator is set in the handle, * @param[in] graph input graph object @@ -695,11 +696,11 @@ louvain(raft::handle_t const &handle, * @param[out] clustering Pointer to device array where the clustering should be stored * */ -template +template void flatten_dendrogram(raft::handle_t const &handle, - graph_t const &graph_view, - Dendrogram const &dendrogram, - typename graph_t::vertex_type *clustering); + graph_view_t const &graph_view, + Dendrogram const &dendrogram, + typename graph_view_t::vertex_type *clustering); /** * @brief Leiden implementation diff --git a/cpp/src/community/louvain.cu b/cpp/src/community/louvain.cu index a3a2ffa39eb..25b11e9028a 100644 --- a/cpp/src/community/louvain.cu +++ b/cpp/src/community/louvain.cu @@ -107,43 +107,43 @@ void flatten_dendrogram( } // namespace detail -template -std::pair>, typename graph_t::weight_type> +template +std::pair>, typename graph_view_t::weight_type> louvain(raft::handle_t const &handle, - graph_t const &graph, + graph_view_t const &graph_view, size_t max_level, - typename graph_t::weight_type resolution) + typename graph_view_t::weight_type resolution) { - return detail::louvain(handle, graph, max_level, resolution); + return detail::louvain(handle, graph_view, max_level, resolution); } -template +template void flatten_dendrogram(raft::handle_t const &handle, - graph_t const &graph_view, - Dendrogram const &dendrogram, - typename graph_t::vertex_type *clustering) + graph_view_t const &graph_view, + Dendrogram const &dendrogram, + typename graph_view_t::vertex_type *clustering) { detail::flatten_dendrogram(handle, graph_view, dendrogram, clustering); } -template -std::pair louvain(raft::handle_t const &handle, - graph_t const &graph, - typename graph_t::vertex_type *clustering, +template +std::pair louvain(raft::handle_t const &handle, + graph_view_t const &graph_view, + typename graph_view_t::vertex_type *clustering, size_t max_level, - typename graph_t::weight_type resolution) + typename graph_view_t::weight_type resolution) { - using vertex_t = typename graph_t::vertex_type; - using weight_t = typename graph_t::weight_type; + using vertex_t = typename graph_view_t::vertex_type; + using weight_t = typename graph_view_t::weight_type; CUGRAPH_EXPECTS(clustering != nullptr, "Invalid input argument: clustering is null"); std::unique_ptr> dendrogram; weight_t modularity; - std::tie(dendrogram, modularity) = louvain(handle, graph, max_level, resolution); + std::tie(dendrogram, modularity) = louvain(handle, graph_view, max_level, resolution); - flatten_dendrogram(handle, graph, *dendrogram, clustering); + flatten_dendrogram(handle, graph_view, *dendrogram, clustering); return std::make_pair(dendrogram->num_levels(), modularity); } From 38dc0e5a73ee8aab8c204b0eb7a6c26ba40e24bc Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Mon, 29 Mar 2021 09:40:21 -0400 Subject: [PATCH 17/18] fix clang formatting issue --- cpp/src/community/louvain.cu | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/cpp/src/community/louvain.cu b/cpp/src/community/louvain.cu index 25b11e9028a..2affcf29805 100644 --- a/cpp/src/community/louvain.cu +++ b/cpp/src/community/louvain.cu @@ -108,7 +108,8 @@ void flatten_dendrogram( } // namespace detail template -std::pair>, typename graph_view_t::weight_type> +std::pair>, + typename graph_view_t::weight_type> louvain(raft::handle_t const &handle, graph_view_t const 
&graph_view, size_t max_level, @@ -127,11 +128,12 @@ void flatten_dendrogram(raft::handle_t const &handle, } template -std::pair louvain(raft::handle_t const &handle, - graph_view_t const &graph_view, - typename graph_view_t::vertex_type *clustering, - size_t max_level, - typename graph_view_t::weight_type resolution) +std::pair louvain( + raft::handle_t const &handle, + graph_view_t const &graph_view, + typename graph_view_t::vertex_type *clustering, + size_t max_level, + typename graph_view_t::weight_type resolution) { using vertex_t = typename graph_view_t::vertex_type; using weight_t = typename graph_view_t::weight_type; From a8c91c20b10742cdba922ea9a410f9e87cf81222 Mon Sep 17 00:00:00 2001 From: Chuck Hastings Date: Mon, 29 Mar 2021 13:09:34 -0400 Subject: [PATCH 18/18] legacy louvain path now uses cuco also --- cpp/tests/community/louvain_test.cpp | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/cpp/tests/community/louvain_test.cpp b/cpp/tests/community/louvain_test.cpp index ab92ce411b9..2ebf9a85902 100644 --- a/cpp/tests/community/louvain_test.cpp +++ b/cpp/tests/community/louvain_test.cpp @@ -74,10 +74,26 @@ class Tests_Louvain : public ::testing::TestWithParam { directed, configuration.graph_file_full_path_); auto graph_view = graph->view(); - louvain(graph_view, - graph_view.get_number_of_vertices(), - configuration.expected_level_, - configuration.expected_modularity_); + // "FIXME": remove this check once we drop support for Pascal + // + // Calling louvain on Pascal will throw an exception, we'll check that + // this is the behavior while we still support Pascal (device_prop.major < 7) + // + cudaDeviceProp device_prop; + CUDA_CHECK(cudaGetDeviceProperties(&device_prop, 0)); + + if (device_prop.major < 7) { + EXPECT_THROW(louvain(graph_view, + graph_view.get_number_of_vertices(), + configuration.expected_level_, + configuration.expected_modularity_), + cugraph::logic_error); + } else { + louvain(graph_view, + graph_view.get_number_of_vertices(), + configuration.expected_level_, + configuration.expected_modularity_); + } } template