From 723688fb1fcc0ff88df8850899e4e3168b78131e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 19 Aug 2021 11:35:12 -0400 Subject: [PATCH 01/57] delete unused file --- .../cugraph/vertex_partition_device.cuh | 110 ------------------ 1 file changed, 110 deletions(-) delete mode 100644 cpp/include/cugraph/vertex_partition_device.cuh diff --git a/cpp/include/cugraph/vertex_partition_device.cuh b/cpp/include/cugraph/vertex_partition_device.cuh deleted file mode 100644 index 9a5bbf4bbcf..00000000000 --- a/cpp/include/cugraph/vertex_partition_device.cuh +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include - -namespace cugraph { - -template -class vertex_partition_device_base_t { - public: - vertex_partition_device_base_t(vertex_t number_of_vertices) - : number_of_vertices_(number_of_vertices) - { - } - - template - __host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( - vertex_type v) const noexcept - { - return ((v >= 0) && (v < number_of_vertices_)); - } - - template - __host__ __device__ std::enable_if_t::value, bool> is_valid_vertex( - vertex_type v) const noexcept - { - return (v < number_of_vertices_); - } - - private: - // should be trivially copyable to device - vertex_t number_of_vertices_{0}; -}; - -template -class vertex_partition_device_t; - -// multi-GPU version -template -class vertex_partition_device_t> - : public vertex_partition_device_base_t { - public: - vertex_partition_device_t(GraphViewType const& graph_view) - : vertex_partition_device_base_t( - graph_view.get_number_of_vertices()), - first_(graph_view.get_local_vertex_first()), - last_(graph_view.get_local_vertex_last()) - { - } - - __host__ __device__ bool is_local_vertex_nocheck( - typename GraphViewType::vertex_type v) const noexcept - { - return (v >= first_) && (v < last_); - } - - __host__ __device__ typename GraphViewType::vertex_type - get_local_vertex_offset_from_vertex_nocheck(typename GraphViewType::vertex_type v) const noexcept - { - return v - first_; - } - - private: - // should be trivially copyable to device - typename GraphViewType::vertex_type first_{0}; - typename GraphViewType::vertex_type last_{0}; -}; - -// single-GPU version -template -class vertex_partition_device_t> - : public vertex_partition_device_base_t { - public: - vertex_partition_device_t(GraphViewType const& graph_view) - : vertex_partition_device_base_t( - graph_view.get_number_of_vertices()) - { - } - - __host__ __device__ constexpr bool is_local_vertex_nocheck( - typename GraphViewType::vertex_type v) const noexcept - { - return true; - } - - __host__ __device__ constexpr typename GraphViewType::vertex_type - get_local_vertex_offset_from_vertex_nocheck(typename GraphViewType::vertex_type v) const noexcept - { - return v; - } -}; - -} // namespace cugraph From aaf7bb33aa018353a161c6e2cd76d94bcdf4e711 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 25 Aug 2021 22:38:39 
-0400 Subject: [PATCH 02/57] update headers to support row/col input properties wrapper --- cpp/include/cugraph/graph_view.hpp | 52 ++++ .../prims/copy_to_adj_matrix_row_col.cuh | 164 +++++------ .../cugraph/prims/row_col_properties.cuh | 258 ++++++++++++++++++ 3 files changed, 383 insertions(+), 91 deletions(-) create mode 100644 cpp/include/cugraph/prims/row_col_properties.cuh diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 3d22828731e..3cab3b7ff8f 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -554,6 +554,26 @@ class graph_view_t get_local_sorted_unique_edge_row_begin() const + { + return local_sorted_unique_edge_row_first_; + } + + std::optional get_local_sorted_unique_edge_row_end() const + { + return local_sorted_unique_edge_row_last_; + } + + std::optional get_local_sorted_unique_edge_col_begin() const + { + return local_sorted_unique_edge_col_first_; + } + + std::optional get_local_sorted_unique_edge_col_end() const + { + return local_sorted_unique_edge_col_last_; + } + private: std::vector adj_matrix_partition_offsets_{}; std::vector adj_matrix_partition_indices_{}; @@ -569,6 +589,12 @@ class graph_view_t> adj_matrix_partition_segment_offsets_{}; + + // FIXME: to be implemented. + std::optional local_sorted_unique_edge_row_first_{std::nullopt}; + std::optional local_sorted_unique_edge_row_last_{std::nullopt}; + std::optional local_sorted_unique_edge_col_first_{std::nullopt}; + std::optional local_sorted_unique_edge_col_last_{std::nullopt}; }; // single-GPU version @@ -748,6 +774,26 @@ class graph_view_t get_local_sorted_unique_edge_row_begin() const + { + return local_sorted_unique_edge_row_first_; + } + + std::optional get_local_sorted_unique_edge_row_end() const + { + return local_sorted_unique_edge_row_last_; + } + + std::optional get_local_sorted_unique_edge_col_begin() const + { + return local_sorted_unique_edge_col_first_; + } + + std::optional get_local_sorted_unique_edge_col_end() const + { + return local_sorted_unique_edge_col_last_; + } + private: edge_t const* offsets_{nullptr}; vertex_t const* indices_{nullptr}; @@ -755,6 +801,12 @@ class graph_view_t> segment_offsets_{std::nullopt}; + + // FIXME: to be implemented. 
+ std::optional local_sorted_unique_edge_row_first_{std::nullopt}; + std::optional local_sorted_unique_edge_row_last_{std::nullopt}; + std::optional local_sorted_unique_edge_col_first_{std::nullopt}; + std::optional local_sorted_unique_edge_col_last_{std::nullopt}; }; } // namespace cugraph diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index 96aefa016fa..bd0d4a7e3dd 100644 --- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -44,11 +45,11 @@ namespace detail { template + typename MatrixMajorValueOutputWrapper> void copy_to_matrix_major(raft::handle_t const& handle, GraphViewType const& graph_view, VertexValueInputIterator vertex_value_input_first, - MatrixMajorValueOutputIterator matrix_major_value_output_first) + MatrixMajorValueOutputWrapper& matrix_major_value_output) { if (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); @@ -79,7 +80,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, } device_allgatherv(col_comm, vertex_value_input_first, - matrix_major_value_output_first, + matrix_major_value_output.value_data(), rx_counts, displacements, handle.get_stream()); @@ -101,20 +102,20 @@ void copy_to_matrix_major(raft::handle_t const& handle, thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), vertex_value_input_first, vertex_value_input_first + graph_view.get_number_of_local_vertices(), - matrix_major_value_output_first); + matrix_major_value_output.value_data()); } } template + typename MatrixMajorValueOutputWrapper> void copy_to_matrix_major(raft::handle_t const& handle, GraphViewType const& graph_view, VertexIterator vertex_first, VertexIterator vertex_last, VertexValueInputIterator vertex_value_input_first, - MatrixMajorValueOutputIterator matrix_major_value_output_first) + MatrixMajorValueOutputWrapper& matrix_major_value_output) { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -194,7 +195,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, rx_value_first, rx_value_first + rx_counts[i], map_first, - matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); + matrix_major_value_output.value_data() + matrix_partition.get_major_value_start_offset()); } else { auto map_first = thrust::make_transform_iterator( rx_vertices.begin(), [matrix_partition] __device__(auto v) { @@ -207,7 +208,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, rx_value_first, rx_value_first + rx_counts[i], map_first, - matrix_major_value_output_first + matrix_partition.get_major_value_start_offset()); + matrix_major_value_output.value_data() + matrix_partition.get_major_value_start_offset()); } } @@ -230,17 +231,17 @@ void copy_to_matrix_major(raft::handle_t const& handle, val_first, val_first + thrust::distance(vertex_first, vertex_last), vertex_first, - matrix_major_value_output_first); + matrix_major_value_output.value_data()); } } template + typename MatrixMinorValueOutputWrapper> void copy_to_matrix_minor(raft::handle_t const& handle, GraphViewType const& graph_view, VertexValueInputIterator vertex_value_input_first, - MatrixMinorValueOutputIterator matrix_minor_value_output_first) + MatrixMinorValueOutputWrapper& matrix_minor_value_output) { if (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); @@ -271,7 
+272,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, } device_allgatherv(row_comm, vertex_value_input_first, - matrix_minor_value_output_first, + matrix_minor_value_output.value_data(), rx_counts, displacements, handle.get_stream()); @@ -293,20 +294,20 @@ void copy_to_matrix_minor(raft::handle_t const& handle, thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), vertex_value_input_first, vertex_value_input_first + graph_view.get_number_of_local_vertices(), - matrix_minor_value_output_first); + matrix_minor_value_output.value_data()); } } template + typename MatrixMinorValueOutputWrapper> void copy_to_matrix_minor(raft::handle_t const& handle, GraphViewType const& graph_view, VertexIterator vertex_first, VertexIterator vertex_last, VertexValueInputIterator vertex_value_input_first, - MatrixMinorValueOutputIterator matrix_minor_value_output_first) + MatrixMinorValueOutputWrapper& matrix_minor_value_output) { using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; @@ -384,7 +385,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, rx_value_first, rx_value_first + rx_counts[i], map_first, - matrix_minor_value_output_first); + matrix_minor_value_output.value_data()); } else { auto map_first = thrust::make_transform_iterator( rx_vertices.begin(), [matrix_partition] __device__(auto v) { @@ -396,7 +397,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, rx_value_first, rx_value_first + rx_counts[i], map_first, - matrix_minor_value_output_first); + matrix_minor_value_output.value_data()); } } @@ -418,7 +419,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, val_first, val_first + thrust::distance(vertex_first, vertex_last), vertex_first, - matrix_minor_value_output_first); + matrix_minor_value_output.value_data()); } } @@ -433,33 +434,29 @@ void copy_to_matrix_minor(raft::handle_t const& handle, * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexValueInputIterator Type of the iterator for vertex properties. - * @tparam AdjMatrixRowValueOutputIterator Type of the iterator for graph adjacency matrix row - * output property variables. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_row_value_output_first Iterator pointing to the adjacency matrix row output - * property variables for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_output_last` (exclusive) is deduced as @p adj_matrix_row_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_rows(). + * @param adj_matrix_row_value_output Wrapper used to access data storage to copy row properties (for the rows assigned to this process in multi-GPU). 
*/ -template -void copy_to_adj_matrix_row(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexValueInputIterator vertex_value_input_first, - AdjMatrixRowValueOutputIterator adj_matrix_row_value_output_first) +template +void copy_to_adj_matrix_row( + raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexValueInputIterator vertex_value_input_first, + row_properties_t::value_type>& + adj_matrix_row_value_output) { - if (GraphViewType::is_adj_matrix_transposed) { + if constexpr (GraphViewType::is_adj_matrix_transposed) { copy_to_matrix_minor( - handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output_first); + handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output); } else { copy_to_matrix_major( - handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output_first); + handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output); } } @@ -474,8 +471,6 @@ void copy_to_adj_matrix_row(raft::handle_t const& handle, * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. * @tparam VertexValueInputIterator Type of the iterator for vertex properties. - * @tparam AdjMatrixRowValueOutputIterator Type of the iterator for graph adjacency matrix row - * output property variables. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. @@ -486,36 +481,33 @@ void copy_to_adj_matrix_row(raft::handle_t const& handle, * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_row_value_output_first Iterator pointing to the adjacency matrix row output - * property variables for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_output_last` (exclusive) is deduced as @p adj_matrix_row_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_rows(). + * @param adj_matrix_row_value_output Wrapper used to access data storage to copy row properties (for the rows assigned to this process in multi-GPU). 
*/ -template -void copy_to_adj_matrix_row(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, - VertexValueInputIterator vertex_value_input_first, - AdjMatrixRowValueOutputIterator adj_matrix_row_value_output_first) +template +void copy_to_adj_matrix_row( + raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + VertexValueInputIterator vertex_value_input_first, + row_properties_t::value_type>& + adj_matrix_row_value_output) { - if (GraphViewType::is_adj_matrix_transposed) { + if constexpr (GraphViewType::is_adj_matrix_transposed) { copy_to_matrix_minor(handle, graph_view, vertex_first, vertex_last, vertex_value_input_first, - adj_matrix_row_value_output_first); + adj_matrix_row_value_output); } else { copy_to_matrix_major(handle, graph_view, vertex_first, vertex_last, vertex_value_input_first, - adj_matrix_row_value_output_first); + adj_matrix_row_value_output); } } @@ -523,38 +515,33 @@ void copy_to_adj_matrix_row(raft::handle_t const& handle, * @brief Copy vertex property values to the corresponding graph adjacency matrix column property * variables. * - * This version fills the entire set of graph adjacency matrix column property values. This function - * is inspired by thrust::copy(). + * This version fills the entire set of graph adjacency matrix column property values. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexValueInputIterator Type of the iterator for vertex properties. - * @tparam AdjMatrixColValueOutputIterator Type of the iterator for graph adjacency matrix column - * output property variables. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_col_value_output_first Iterator pointing to the adjacency matrix column output - * property variables for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param adj_matrix_col_value_output Wrapper used to access data storage to copy column properties (for the columns assigned to this process in multi-GPU). 
*/ -template -void copy_to_adj_matrix_col(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexValueInputIterator vertex_value_input_first, - AdjMatrixColValueOutputIterator adj_matrix_col_value_output_first) +template +void copy_to_adj_matrix_col( + raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexValueInputIterator vertex_value_input_first, + col_properties_t::value_type>& + adj_matrix_col_value_output) { - if (GraphViewType::is_adj_matrix_transposed) { + if constexpr (GraphViewType::is_adj_matrix_transposed) { copy_to_matrix_major( - handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output_first); + handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output); } else { copy_to_matrix_minor( - handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output_first); + handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output); } } @@ -564,13 +551,11 @@ void copy_to_adj_matrix_col(raft::handle_t const& handle, * * This version fills only a subset of graph adjacency matrix column property values. [@p * vertex_first, @p vertex_last) specifies the vertices with new values to be copied to graph - * adjacency matrix column property variables. This function is inspired by thrust::copy(). + * adjacency matrix column property variables. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. * @tparam VertexValueInputIterator Type of the iterator for vertex properties. - * @tparam AdjMatrixColValueOutputIterator Type of the iterator for graph adjacency matrix column - * output property variables. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. @@ -581,36 +566,33 @@ void copy_to_adj_matrix_col(raft::handle_t const& handle, * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_col_value_output_first Iterator pointing to the adjacency matrix column output - * property variables for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param adj_matrix_col_value_output Wrapper used to access data storage to copy column properties (for the columns assigned to this process in multi-GPU). 
*/ -template -void copy_to_adj_matrix_col(raft::handle_t const& handle, - GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, - VertexValueInputIterator vertex_value_input_first, - AdjMatrixColValueOutputIterator adj_matrix_col_value_output_first) +template +void copy_to_adj_matrix_col( + raft::handle_t const& handle, + GraphViewType const& graph_view, + VertexIterator vertex_first, + VertexIterator vertex_last, + VertexValueInputIterator vertex_value_input_first, + col_properties_t::value_type>& + adj_matrix_col_value_output) { - if (GraphViewType::is_adj_matrix_transposed) { + if constexpr (GraphViewType::is_adj_matrix_transposed) { copy_to_matrix_major(handle, graph_view, vertex_first, vertex_last, vertex_value_input_first, - adj_matrix_col_value_output_first); + adj_matrix_col_value_output); } else { copy_to_matrix_minor(handle, graph_view, vertex_first, vertex_last, vertex_value_input_first, - adj_matrix_col_value_output_first); + adj_matrix_col_value_output); } } diff --git a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh new file mode 100644 index 00000000000..b1f550cf3e3 --- /dev/null +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -0,0 +1,258 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#pragma once
+
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+namespace cugraph {
+
+namespace detail {
+
+template <typename vertex_t, typename ValueIterator>
+struct key_to_value_t {
+  thrust::optional<vertex_t const*> const key_first{};
+  thrust::optional<vertex_t const*> const key_last{};
+  ValueIterator const value_first{};
+
+  __device__ typename thrust::iterator_traits<ValueIterator>::value_type operator()(
+    vertex_t offset) const
+  {
+    if (key_first) {
+      auto it = thrust::lower_bound(thrust::seq, *key_first, *key_last, offset);
+      assert((it != *key_last) && (*it == offset));
+      return *(value_first + thrust::distance(*key_first, it));
+    } else {
+      return *(value_first + offset);
+    }
+  }
+};
+
+template <typename vertex_t, typename T>
+class major_properties_t {
+ public:
+  major_properties_t() : buffer_(allocate_dataframe_buffer<T>(0, rmm::cuda_stream_view{})) {}
+
+  major_properties_t(raft::handle_t const& handle, vertex_t buffer_size)
+    : buffer_(allocate_dataframe_buffer<T>(buffer_size, handle.get_stream()))
+  {
+  }
+
+  void fill(T value, rmm::cuda_stream_view stream)
+  {
+    thrust::fill(
+      rmm::exec_policy(stream), value_data(), value_data() + size_dataframe_buffer(buffer_), value);
+  }
+
+  auto begin() const { return get_dataframe_buffer_begin(buffer_); }
+
+  auto value_data() { return get_dataframe_buffer_begin(buffer_); }
+
+ private:
+  decltype(allocate_dataframe_buffer<T>(0, rmm::cuda_stream_view{})) buffer_;
+};
+
+template <typename vertex_t, typename T>
+class minor_properties_t {
+ public:
+  minor_properties_t()
+    : key_first_(std::nullopt),
+      key_last_(std::nullopt),
+      buffer_(allocate_dataframe_buffer<T>(0, rmm::cuda_stream_view{}))
+  {
+  }
+
+  minor_properties_t(raft::handle_t const& handle, vertex_t buffer_size)
+    : key_first_(std::nullopt),
+      key_last_(std::nullopt),
+      buffer_(allocate_dataframe_buffer<T>(buffer_size, handle.get_stream()))
+  {
+  }
+
+  minor_properties_t(raft::handle_t const& handle,
+                     vertex_t const* key_first,
+                     vertex_t const* key_last)
+    : key_first_(key_first),
+      key_last_(key_last),
+      buffer_(
+        allocate_dataframe_buffer<T>(thrust::distance(key_first, key_last), handle.get_stream()))
+  {
+  }
+
+  void fill(T value, rmm::cuda_stream_view stream)
+  {
+    thrust::fill(
+      rmm::exec_policy(stream), value_data(), value_data() + size_dataframe_buffer(buffer_), value);
+  }
+
+  auto begin() const
+  {
+    auto value_first = get_dataframe_buffer_begin(buffer_);
+    return thrust::make_transform_iterator(
+      thrust::make_counting_iterator(vertex_t{0}),
+      key_to_value_t<vertex_t, decltype(value_first)>{
+        key_first_ ? thrust::make_optional(*key_first_) : thrust::nullopt,
+        key_last_ ? thrust::make_optional(*key_last_) : thrust::nullopt,
+        value_first});
+  }
+
+  auto value_data() { return get_dataframe_buffer_begin(buffer_); }
+
+ private:
+  std::optional<vertex_t const*> key_first_{std::nullopt};
+  std::optional<vertex_t const*> key_last_{std::nullopt};
+
+  decltype(allocate_dataframe_buffer<T>(0, rmm::cuda_stream_view{})) buffer_;
+};
+
+}  // namespace detail
+
+template <typename GraphViewType, typename T, typename Enable = void>
+class row_properties_t;
+
+template <typename GraphViewType, typename T>
+class row_properties_t<GraphViewType,
+                       T,
+                       std::enable_if_t<GraphViewType::is_adj_matrix_transposed>> {
+ public:
+  using value_type = T;
+
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+
+  row_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view)
+  {
+    auto key_first = graph_view.get_local_sorted_unique_edge_row_begin();
+    auto key_last  = graph_view.get_local_sorted_unique_edge_row_end();
+    if (key_first) {
+      properties_ = detail::minor_properties_t<typename GraphViewType::vertex_type, T>(
+        handle, *key_first, *key_last);
+    } else {
+      properties_ = detail::minor_properties_t<typename GraphViewType::vertex_type, T>(
+        handle, graph_view.get_number_of_local_adj_matrix_partition_rows());
+    }
+  }
+
+  void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); }
+
+  auto begin() const { return properties_.begin(); }
+  auto value_data() { return properties_.value_data(); }
+
+ private:
+  detail::minor_properties_t<typename GraphViewType::vertex_type, T> properties_{};
+};
+
+template <typename GraphViewType, typename T>
+class row_properties_t<GraphViewType,
+                       T,
+                       std::enable_if_t<!GraphViewType::is_adj_matrix_transposed>> {
+ public:
+  using value_type = T;
+
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+
+  row_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view)
+  {
+    properties_ = detail::major_properties_t<typename GraphViewType::vertex_type, T>(
+      handle, graph_view.get_number_of_local_adj_matrix_partition_rows());
+  }
+
+  void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); }
+
+  auto begin() const { return properties_.begin(); }
+  auto value_data() { return properties_.value_data(); }
+
+ private:
+  detail::major_properties_t<typename GraphViewType::vertex_type, T> properties_{};
+};
+
+template <typename GraphViewType, typename T, typename Enable = void>
+class col_properties_t;
+
+template <typename GraphViewType, typename T>
+class col_properties_t<GraphViewType,
+                       T,
+                       std::enable_if_t<GraphViewType::is_adj_matrix_transposed>> {
+ public:
+  using value_type = T;
+
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+
+  col_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view)
+  {
+    properties_ = detail::major_properties_t<typename GraphViewType::vertex_type, T>(
+      handle, graph_view.get_number_of_local_adj_matrix_partition_cols());
+  }
+
+  void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); }
+
+  auto begin() const { return properties_.begin(); }
+  auto value_data() { return properties_.value_data(); }
+
+ private:
+  detail::major_properties_t<typename GraphViewType::vertex_type, T> properties_{};
+};
+
+template <typename GraphViewType, typename T>
+class col_properties_t<GraphViewType,
+                       T,
+                       std::enable_if_t<!GraphViewType::is_adj_matrix_transposed>> {
+ public:
+  using value_type = T;
+
+  static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic<T>::value);
+
+  col_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view)
+  {
+    auto key_first = graph_view.get_local_sorted_unique_edge_col_begin();
+    auto key_last  = graph_view.get_local_sorted_unique_edge_col_end();
+    if (key_first) {
+      properties_ = detail::minor_properties_t<typename GraphViewType::vertex_type, T>(
+        handle, *key_first, *key_last);
+    } else {
+      properties_ = detail::minor_properties_t<typename GraphViewType::vertex_type, T>(
+        handle, graph_view.get_number_of_local_adj_matrix_partition_cols());
+    }
+  }
+
+  void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); }
+
+  auto begin() const { return properties_.begin(); }
+  auto value_data() { return properties_.value_data(); }
+
+ private:
+  detail::minor_properties_t<typename GraphViewType::vertex_type, T> properties_{};
+};
+
+class dummy_properties_t {
+ public:
+  using value_type = thrust::nullopt_t;
+
+  auto begin() const { return thrust::make_constant_iterator(thrust::nullopt); }
+};
+
+}
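// Hedged host-side sketch (not part of the patch) of the lookup that
// key_to_value_t performs on the device: when a sorted, unique key list is
// present, property values are stored once per key rather than once per
// vertex in the minor range, and a vertex offset is translated to its value
// slot by binary search; otherwise the offset indexes the dense value array
// directly. All names below are illustrative.
#include <algorithm>
#include <cassert>
#include <vector>

int key_compressed_lookup(std::vector<int> const& keys,    // sorted & unique
                          std::vector<int> const& values,  // one slot per key
                          int offset)
{
  auto it = std::lower_bound(keys.begin(), keys.end(), offset);
  assert((it != keys.end()) && (*it == offset));  // offset must be a stored key
  return values[it - keys.begin()];
}

int main()
{
  std::vector<int> keys{1, 3, 7};       // e.g. local sorted unique edge rows
  std::vector<int> values{10, 20, 30};  // 3 slots instead of one per vertex
  assert(key_compressed_lookup(keys, values, 3) == 20);
  return 0;
}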
// namespace cugraph From 06ed6c59bd2acd2de3795cc3582009846177ff48 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 25 Aug 2021 22:39:48 -0400 Subject: [PATCH 03/57] update to use the wrapper --- cpp/src/link_analysis/pagerank.cu | 8 +++--- cpp/src/structure/coarsen_graph.cu | 46 ++++++++++++++++++++---------- cpp/src/structure/graph_view.cu | 17 +++++------ cpp/src/traversal/sssp.cu | 36 +++++++---------------- 4 files changed, 54 insertions(+), 53 deletions(-) diff --git a/cpp/src/link_analysis/pagerank.cu b/cpp/src/link_analysis/pagerank.cu index 69d5927f629..f06fdccf481 100644 --- a/cpp/src/link_analysis/pagerank.cu +++ b/cpp/src/link_analysis/pagerank.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -188,8 +189,7 @@ void pagerank( // old PageRank values rmm::device_uvector old_pageranks(pull_graph_view.get_number_of_local_vertices(), handle.get_stream()); - rmm::device_uvector adj_matrix_row_pageranks( - pull_graph_view.get_number_of_local_adj_matrix_partition_rows(), handle.get_stream()); + row_properties_t adj_matrix_row_pageranks(handle, pull_graph_view); size_t iter{0}; while (true) { thrust::copy(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), @@ -223,7 +223,7 @@ void pagerank( return pagerank / divisor; }); - copy_to_adj_matrix_row(handle, pull_graph_view, pageranks, adj_matrix_row_pageranks.begin()); + copy_to_adj_matrix_row(handle, pull_graph_view, pageranks, adj_matrix_row_pageranks); auto unvarying_part = aggregate_personalization_vector_size == 0 ? (dangling_sum * alpha + static_cast(1.0 - alpha)) / @@ -234,7 +234,7 @@ void pagerank( handle, pull_graph_view, adj_matrix_row_pageranks.begin(), - thrust::make_constant_iterator(0) /* dummy */, + dummy_properties_t{}.begin(), [alpha] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { return src_val * w * alpha; }, diff --git a/cpp/src/structure/coarsen_graph.cu b/cpp/src/structure/coarsen_graph.cu index a7abb4846bd..02c2fbae3ab 100644 --- a/cpp/src/structure/coarsen_graph.cu +++ b/cpp/src/structure/coarsen_graph.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -129,17 +130,27 @@ edge_t groupby_e_and_coarsen_edgelist(vertex_t* edgelist_major_vertices /* [INOU } } -template +template std::tuple, rmm::device_uvector, std::optional>> decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( raft::handle_t const& handle, matrix_partition_device_view_t const matrix_partition, - vertex_t const* p_major_labels, - vertex_t const* p_minor_labels, + VertexIterator0 const major_label_first, + VertexIterator1 const minor_label_first, std::optional> const& segment_offsets) { + static_assert( + std::is_same_v::value_type, vertex_t>); + static_assert( + std::is_same_v::value_type, vertex_t>); + // FIXME: it might be possible to directly create relabled & coarsened edgelist from the // compressed sparse format to save memory @@ -152,12 +163,13 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( pair_first, pair_first + edgelist_major_vertices.size(), pair_first, - [p_major_labels, - p_minor_labels, + [major_label_first, + minor_label_first, major_first = matrix_partition.get_major_first(), minor_first = matrix_partition.get_minor_first()] __device__(auto val) { - return thrust::make_tuple(p_major_labels[thrust::get<0>(val) - major_first], - p_minor_labels[thrust::get<1>(val) - minor_first]); + return thrust::make_tuple( + *(major_label_first + (thrust::get<0>(val) - major_first)), + 
*(minor_label_first + (thrust::get<1>(val) - minor_first))); }); auto number_of_edges = groupby_e_and_coarsen_edgelist( @@ -212,16 +224,20 @@ coarsen_graph( // 1. construct coarsened edge list - rmm::device_uvector adj_matrix_minor_labels( - store_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows() - : graph_view.get_number_of_local_adj_matrix_partition_cols(), - handle.get_stream()); - if (store_transposed) { - copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_minor_labels.data()); + std::conditional_t< + store_transposed, + row_properties_t, + vertex_t>, + col_properties_t, + vertex_t>> + adj_matrix_minor_labels(handle, graph_view); + if constexpr (store_transposed) { + copy_to_adj_matrix_row(handle, graph_view, labels, adj_matrix_minor_labels); } else { - copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels.data()); + copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels); } + std::vector> coarsened_edgelist_major_vertices{}; std::vector> coarsened_edgelist_minor_vertices{}; auto coarsened_edgelist_weights = @@ -291,7 +307,7 @@ coarsen_graph( matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)), major_labels.data(), - adj_matrix_minor_labels.data(), + adj_matrix_minor_labels.begin(), graph_view.get_local_adj_matrix_partition_segment_offsets(i)); // 1-2. globally shuffle diff --git a/cpp/src/structure/graph_view.cu b/cpp/src/structure/graph_view.cu index 088ed214a74..05de14afd19 100644 --- a/cpp/src/structure/graph_view.cu +++ b/cpp/src/structure/graph_view.cu @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -92,8 +93,8 @@ rmm::device_uvector compute_minor_degrees( copy_v_transform_reduce_out_nbr( handle, graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, + dummy_properties_t{}.begin(), + dummy_properties_t{}.begin(), [] __device__(vertex_t, vertex_t, weight_t, auto, auto) { return edge_t{1}; }, edge_t{0}, minor_degrees.data()); @@ -101,8 +102,8 @@ rmm::device_uvector compute_minor_degrees( copy_v_transform_reduce_in_nbr( handle, graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, + dummy_properties_t{}.begin(), + dummy_properties_t{}.begin(), [] __device__(vertex_t, vertex_t, weight_t, auto, auto) { return edge_t{1}; }, edge_t{0}, minor_degrees.data()); @@ -127,8 +128,8 @@ rmm::device_uvector compute_weight_sums( copy_v_transform_reduce_in_nbr( handle, graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, + dummy_properties_t{}.begin(), + dummy_properties_t{}.begin(), [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w; }, weight_t{0.0}, weight_sums.data()); @@ -136,8 +137,8 @@ rmm::device_uvector compute_weight_sums( copy_v_transform_reduce_out_nbr( handle, graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, + dummy_properties_t{}.begin(), + dummy_properties_t{}.begin(), [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w; }, weight_t{0.0}, weight_sums.data()); diff --git a/cpp/src/traversal/sssp.cu b/cpp/src/traversal/sssp.cu index 8402a74181b..3eb4272cf39 100644 --- a/cpp/src/traversal/sssp.cu +++ b/cpp/src/traversal/sssp.cu @@ -134,22 +134,8 @@ void sssp(raft::handle_t const& handle, // 5. 
SSSP iteration - bool vertex_and_adj_matrix_row_ranges_coincide = - push_graph_view.get_number_of_local_vertices() == - push_graph_view.get_number_of_local_adj_matrix_partition_rows() - ? true - : false; - rmm::device_uvector adj_matrix_row_distances(0, handle.get_stream()); - if (!vertex_and_adj_matrix_row_ranges_coincide) { - adj_matrix_row_distances.resize(push_graph_view.get_number_of_local_adj_matrix_partition_rows(), - handle.get_stream()); - thrust::fill(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()), - adj_matrix_row_distances.begin(), - adj_matrix_row_distances.end(), - std::numeric_limits::max()); - } - auto row_distances = - !vertex_and_adj_matrix_row_ranges_coincide ? adj_matrix_row_distances.data() : distances; + row_properties_t adj_matrix_row_distances(handle, push_graph_view); + adj_matrix_row_distances.fill(std::numeric_limits::max(), handle.get_stream()); if (push_graph_view.is_local_vertex_nocheck(source_vertex)) { vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).insert(source_vertex); @@ -157,15 +143,13 @@ void sssp(raft::handle_t const& handle, auto near_far_threshold = delta; while (true) { - if (!vertex_and_adj_matrix_row_ranges_coincide) { - copy_to_adj_matrix_row( - handle, - push_graph_view, - vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).begin(), - vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), - distances, - row_distances); - } + copy_to_adj_matrix_row( + handle, + push_graph_view, + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).begin(), + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), + distances, + adj_matrix_row_distances); auto vertex_partition = vertex_partition_device_view_t( push_graph_view.get_vertex_partition_view()); @@ -176,7 +160,7 @@ void sssp(raft::handle_t const& handle, vertex_frontier, static_cast(Bucket::cur_near), std::vector{static_cast(Bucket::next_near), static_cast(Bucket::far)}, - row_distances, + adj_matrix_row_distances.begin(), thrust::make_constant_iterator(0) /* dummy */, [vertex_partition, distances, cutoff] __device__( vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { From 32495c5a235a132bc58c5ec9df57af79eca5a431 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 07:35:27 -0700 Subject: [PATCH 04/57] fix MG Louvain test compile errors --- cpp/tests/community/mg_louvain_helper.cu | 16 +++++++--------- cpp/tests/community/mg_louvain_test.cpp | 1 + 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu index 5909ab177cd..5e084237ba2 100644 --- a/cpp/tests/community/mg_louvain_helper.cu +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -85,8 +85,7 @@ compressed_sparse_to_edgelist(edge_t const* compressed_sparse_offsets, // FIXME: this is highly inefficient for very high-degree vertices, for better performance, we can // fill high-degree vertices using one CUDA block per vertex, mid-degree vertices using one CUDA // warp per vertex, and low-degree vertices using one CUDA thread per block - auto execution_policy = handle.get_thrust_policy(); - thrust::for_each(execution_policy, + thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(major_first), thrust::make_counting_iterator(major_last), [compressed_sparse_offsets, @@ -96,12 +95,12 @@ compressed_sparse_to_edgelist(edge_t const* compressed_sparse_offsets, auto last = compressed_sparse_offsets[v - major_first + 1]; thrust::fill(thrust::seq, p_majors + first, 
p_majors + last, v); }); - thrust::copy(execution_policy, + thrust::copy(rmm::exec_policy(stream), compressed_sparse_indices, compressed_sparse_indices + number_of_edges, edgelist_minor_vertices.begin()); if (compressed_sparse_weights) { - thrust::copy(execution_policy, + thrust::copy(rmm::exec_policy(stream), (*compressed_sparse_weights), (*compressed_sparse_weights) + number_of_edges, (*edgelist_weights).data()); @@ -124,9 +123,8 @@ void sort_and_coarsen_edgelist( size_t number_of_edges{0}; - auto execution_policy = handle.get_thrust_policy(); if (edgelist_weights) { - thrust::sort_by_key(execution_policy, + thrust::sort_by_key(rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size(), (*edgelist_weights).begin()); @@ -137,7 +135,7 @@ void sort_and_coarsen_edgelist( stream); rmm::device_uvector tmp_edgelist_weights(tmp_edgelist_major_vertices.size(), stream); auto it = thrust::reduce_by_key( - execution_policy, + rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size(), (*edgelist_weights).begin(), @@ -150,9 +148,9 @@ void sort_and_coarsen_edgelist( edgelist_minor_vertices = std::move(tmp_edgelist_minor_vertices); (*edgelist_weights) = std::move(tmp_edgelist_weights); } else { - thrust::sort(execution_policy, pair_first, pair_first + edgelist_major_vertices.size()); + thrust::sort(rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size()); auto it = - thrust::unique(execution_policy, pair_first, pair_first + edgelist_major_vertices.size()); + thrust::unique(rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size()); number_of_edges = thrust::distance(pair_first, it); } diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp index 4ceacba2acd..ae75929ca0b 100644 --- a/cpp/tests/community/mg_louvain_test.cpp +++ b/cpp/tests/community/mg_louvain_test.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include From 9e4514cc005d53908540e941e604a2686cfd17bf Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 07:39:07 -0700 Subject: [PATCH 05/57] clang-format --- cpp/tests/community/mg_louvain_helper.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu index 5e084237ba2..d52d8657e2a 100644 --- a/cpp/tests/community/mg_louvain_helper.cu +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -149,8 +149,8 @@ void sort_and_coarsen_edgelist( (*edgelist_weights) = std::move(tmp_edgelist_weights); } else { thrust::sort(rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size()); - auto it = - thrust::unique(rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size()); + auto it = thrust::unique( + rmm::exec_policy(stream), pair_first, pair_first + edgelist_major_vertices.size()); number_of_edges = thrust::distance(pair_first, it); } From be996b3248de3347191e82279d42a6a74108407c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 16:29:18 -0400 Subject: [PATCH 06/57] add thrust utility function to convert to/from std::tuple and to emulate thrust::tuple_cat --- .../cugraph/utilities/thrust_tuple_utils.cuh | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/cpp/include/cugraph/utilities/thrust_tuple_utils.cuh b/cpp/include/cugraph/utilities/thrust_tuple_utils.cuh index a46db93f6b3..1a193c68ca9 100644 --- a/cpp/include/cugraph/utilities/thrust_tuple_utils.cuh +++ 
b/cpp/include/cugraph/utilities/thrust_tuple_utils.cuh @@ -60,6 +60,20 @@ struct compute_thrust_tuple_element_sizes_impl { void compute(std::array::value>& arr) const {} }; +template +auto thrust_tuple_to_std_tuple(TupleType tup, std::index_sequence) +{ + return std::make_tuple(thrust::get(tup)...); +} + +template +auto std_tuple_to_thrust_tuple(TupleType tup, std::index_sequence) +{ + constexpr size_t maximum_thrust_tuple_size = 10; + static_assert(std::tuple_size_v <= maximum_thrust_tuple_size); + return thrust::make_tuple(std::get(tup)...); +} + template __device__ std::enable_if_t::value, void> atomic_accumulate_impl( thrust::detail::any_assign& /* dereferencing thrust::discard_iterator results in this type */ lhs, @@ -178,6 +192,30 @@ struct compute_thrust_tuple_element_sizes { } }; +template +auto thrust_tuple_to_std_tuple(TupleType tup) +{ + return detail::thrust_tuple_to_std_tuple( + tup, std::make_index_sequence::value>{}); +} + +template +auto std_tuple_to_thrust_tuple(TupleType tup) +{ + constexpr size_t maximum_thrust_tuple_size = 10; + static_assert(std::tuple_size_v <= maximum_thrust_tuple_size); + return detail::std_tuple_to_thrust_tuple( + tup, std::make_index_sequence>{}); +} + +// a temporary function to emulate thrust::tuple_cat (not supported) using std::tuple_cat (should +// retire once thrust::tuple is replaced with cuda::std::tuple) +template +auto thrust_tuple_cat(TupleTypes... tups) +{ + return std_tuple_to_thrust_tuple(std::tuple_cat(thrust_tuple_to_std_tuple(tups)...)); +} + template struct atomic_accumulate_thrust_tuple { __device__ constexpr void operator()(Iterator iter, TupleType const& value) const From 2f65f41dc25811f23cc9d993e15923a438afde66 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 16:29:42 -0400 Subject: [PATCH 07/57] added a wrapper class for row/col properties --- .../cugraph/prims/row_col_properties.cuh | 180 +++++++++++++++--- 1 file changed, 149 insertions(+), 31 deletions(-) diff --git a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh index b1f550cf3e3..adf068d2eb5 100644 --- a/cpp/include/cugraph/prims/row_col_properties.cuh +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include #include #include @@ -35,22 +36,59 @@ namespace cugraph { namespace detail { template -struct key_to_value_t { - thrust::optional const key_first{}; - thrust::optional const key_last{}; - ValueIterator const value_first{}; +class major_properties_device_view_t { + public: + using value_type = typename thrust::iterator_traits::value_type; + + major_properties_device_view_t() = default; + + major_properties_device_view_t(ValueIterator value_first) : value_first_(value_first) {} + + void add_offset(vertex_t offset) { value_first_ += offset; } - __device__ typename thrust::iterator_traits::value_type operator()( - vertex_t offset) const + ValueIterator value_data() const { return value_first_; } + + __device__ auto get(vertex_t offset) const { return *(value_first_ + offset); } + + private: + ValueIterator value_first_{}; +}; + +template +class minor_properties_device_view_t { + public: + using value_type = typename thrust::iterator_traits::value_type; + + minor_properties_device_view_t() = default; + + minor_properties_device_view_t(ValueIterator value_first) + : key_first_(thrust::nullopt), key_last_(thrust::nullopt), value_first_(value_first) { - if (key_first) { - auto it = thrust::lower_bound(thrust::seq, *key_first, *key_last, offset); - 
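// Hedged usage sketch (not code from the patch series) for the
// thrust_tuple_cat emulation added in the thrust_tuple_utils.cuh hunk above:
// each thrust::tuple is converted to a std::tuple, std::tuple_cat performs
// the concatenation, and the result is converted back to a thrust::tuple
// (subject to the 10-element thrust::tuple limit asserted in the patch).
#include <cugraph/utilities/thrust_tuple_utils.cuh>

#include <thrust/tuple.h>

#include <cassert>

void thrust_tuple_cat_example()
{
  auto a = thrust::make_tuple(int{1}, float{2.0f});
  auto b = thrust::make_tuple(double{3.0});
  auto c = cugraph::thrust_tuple_cat(a, b);  // thrust::tuple<int, float, double>
  assert(thrust::get<0>(c) == 1);
  assert(thrust::get<2>(c) == 3.0);
}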
assert((it != *key_last) && (*it == offset)); - return *(value_first + thrust::distance(*key_first, it)); - } else { - return *(value_first + offset); + } + + minor_properties_device_view_t(vertex_t const* key_first, + vertex_t const* key_last, + ValueIterator value_first) + : key_first_(key_first), key_last_(key_last), value_first_(value_first) + { + } + + __device__ auto& get(vertex_t offset) const + { + auto value_offset = offset; + if (key_first_) { + auto it = thrust::lower_bound(thrust::seq, *key_first_, *key_last_, offset); + assert((it != *key_last_) && (*it == offset)); + value_offset = static_cast(thrust::distance(*key_first_, it)); } + return *(value_first_ + value_offset); } + + private: + thrust::optional key_first_{thrust::nullopt}; + thrust::optional key_last_{thrust::nullopt}; + + ValueIterator value_first_{}; }; template @@ -65,14 +103,28 @@ class major_properties_t { void fill(T value, rmm::cuda_stream_view stream) { - thrust::fill( - rmm::exec_policy(stream), value_data(), value_data() + size_dataframe_buffer(buffer_), value); + thrust::fill(rmm::exec_policy(stream), + value_data(), + value_data() + size_dataframe_buffer(buffer_), + value); } - auto begin() const { return get_dataframe_buffer_begin(buffer_); } - auto value_data() { return get_dataframe_buffer_begin(buffer_); } + auto device_view() const + { + auto value_first = get_dataframe_buffer_begin(buffer_); + return major_properties_device_view_t(value_first); + } + + auto mutable_device_view() + { + auto value_first = get_dataframe_buffer_begin(buffer_); + static_assert( + !std::is_const_v::value_type>); + return major_properties_device_view_t(value_first); + } + private: decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) buffer_; }; @@ -106,22 +158,37 @@ class minor_properties_t { void fill(T value, rmm::cuda_stream_view stream) { - thrust::fill( - rmm::exec_policy(stream), value_data(), value_data() + size_dataframe_buffer(buffer_), value); + thrust::fill(rmm::exec_policy(stream), + value_data(), + value_data() + size_dataframe_buffer(buffer_), + value); } - auto begin() const + auto value_data() { return get_dataframe_buffer_begin(buffer_); } + + auto device_view() const { auto value_first = get_dataframe_buffer_begin(buffer_); - return thrust::make_transform_iterator( - thrust::make_counting_iterator(vertex_t{0}), - key_to_value_t{ - key_first_ ? thrust::make_optional(*key_first_) : thrust::nullopt, - key_last_ ? 
thrust::make_optional(*key_last_) : thrust::nullopt, - value_first}); + if (key_first_) { + return minor_properties_device_view_t( + *key_first_, *key_last_, value_first); + } else { + return minor_properties_device_view_t(value_first); + } } - auto value_data() { return get_dataframe_buffer_begin(buffer_); } + auto mutable_device_view() + { + auto value_first = get_dataframe_buffer_begin(buffer_); + static_assert( + !std::is_const_v::value_type>); + if (key_first_) { + return minor_properties_device_view_t( + *key_first_, *key_last_, value_first); + } else { + return minor_properties_device_view_t(value_first); + } + } private: std::optional key_first_{std::nullopt}; @@ -130,6 +197,22 @@ class minor_properties_t { decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) buffer_; }; +template ::value_type>::value>* = nullptr> +auto to_thrust_tuple(Iterator iter) +{ + return thrust::make_tuple(iter); +} + +template ::value_type>::value>* = nullptr> +auto to_thrust_tuple(Iterator iter) +{ + return iter.get_iterator_tuple(); +} + } // namespace detail template @@ -144,6 +227,8 @@ class row_properties_t::value); + row_properties_t() = default; + row_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) { auto key_first = graph_view.get_local_sorted_unique_edge_row_begin(); @@ -159,9 +244,11 @@ class row_properties_t properties_{}; }; @@ -175,6 +262,8 @@ class row_properties_t::value); + row_properties_t() = default; + row_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) { properties_ = detail::major_properties_t( @@ -183,9 +272,11 @@ class row_properties_t properties_{}; }; @@ -202,6 +293,8 @@ class col_properties_t::value); + col_properties_t() = default; + col_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) { properties_ = detail::major_properties_t( @@ -210,9 +303,11 @@ class col_properties_t properties_{}; }; @@ -226,6 +321,8 @@ class col_properties_t::value); + col_properties_t() = default; + col_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) { auto key_first = graph_view.get_local_sorted_unique_edge_col_begin(); @@ -241,18 +338,39 @@ class col_properties_t properties_{}; }; +template +class dummy_properties_device_view_t { + public: + using value_type = thrust::nullopt_t; + + void add_offset(vertex_t offset) {} // no-op + + __device__ auto get(vertex_t offset) const { return thrust::nullopt; } +}; + +template class dummy_properties_t { public: using value_type = thrust::nullopt_t; - auto begin() const { return thrust::make_constant_iterator(thrust::nullopt); } + auto device_view() const { return dummy_properties_device_view_t{}; } }; +template +auto device_view_concat(detail::major_properties_device_view_t... 
device_views) +{ + auto concat_first = thrust::make_zip_iterator( + thrust_tuple_cat(detail::to_thrust_tuple(device_views.value_data())...)); + return detail::major_properties_device_view_t(concat_first); +} + } // namespace cugraph From 94717d97597681d2793baa6e1bed340d73b11764 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 16:32:07 -0400 Subject: [PATCH 08/57] update prims to use the row/col properties wrapper --- .../copy_v_transform_reduce_in_out_nbr.cuh | 216 ++++++------ ...ransform_reduce_key_aggregated_out_nbr.cuh | 147 +++++---- cpp/include/cugraph/prims/count_if_e.cuh | 48 +-- .../cugraph/prims/property_op_utils.cuh | 22 +- ...orm_reduce_by_adj_matrix_row_col_key_e.cuh | 309 +++++++++--------- .../cugraph/prims/transform_reduce_e.cuh | 147 +++++---- .../update_frontier_v_push_if_out_nbr.cuh | 131 ++++---- 7 files changed, 515 insertions(+), 505 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh index 335b34828e5..353040d18e8 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh @@ -46,8 +46,8 @@ int32_t constexpr copy_v_transform_reduce_nbr_for_all_block_size = 512; template @@ -57,8 +57,8 @@ __global__ void for_all_major_for_all_nbr_hypersparse( typename GraphViewType::weight_type, GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_hypersparse_first, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultValueOutputIterator result_value_output_first, EdgeOp e_op, T init /* relevent only if update_major == true */) @@ -86,8 +86,8 @@ __global__ void for_all_major_for_all_nbr_hypersparse( thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(static_cast(major_idx)); auto transform_op = [&matrix_partition, - &adj_matrix_row_value_input_first, - &adj_matrix_col_value_input_first, + &adj_matrix_row_value_input, + &adj_matrix_col_value_input, &e_op, major, indices, @@ -106,14 +106,14 @@ __global__ void for_all_major_for_all_nbr_hypersparse( : minor_offset; return evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); }; @@ -143,8 +143,8 @@ __global__ void for_all_major_for_all_nbr_hypersparse( template @@ -155,8 +155,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultValueOutputIterator result_value_output_first, EdgeOp e_op, T init /* relevent only if update_major == true */) @@ -178,8 +178,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( thrust::tie(indices, weights, local_degree) = matrix_partition.get_local_edges(static_cast(major_offset)); auto transform_op = [&matrix_partition, - 
&adj_matrix_row_value_input_first, - &adj_matrix_col_value_input_first, + &adj_matrix_row_value_input, + &adj_matrix_col_value_input, &e_op, major_offset, indices, @@ -201,14 +201,14 @@ __global__ void for_all_major_for_all_nbr_low_degree( : minor_offset; return evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); }; @@ -238,8 +238,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( template @@ -250,8 +250,8 @@ __global__ void for_all_major_for_all_nbr_mid_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultValueOutputIterator result_value_output_first, EdgeOp e_op, T init /* relevent only if update_major == true */) @@ -294,14 +294,14 @@ __global__ void for_all_major_for_all_nbr_mid_degree( : minor_offset; auto e_op_result = evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); if (update_major) { e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); @@ -320,8 +320,8 @@ __global__ void for_all_major_for_all_nbr_mid_degree( template @@ -332,8 +332,8 @@ __global__ void for_all_major_for_all_nbr_high_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultValueOutputIterator result_value_output_first, EdgeOp e_op, T init /* relevent only if update_major == true */) @@ -373,14 +373,14 @@ __global__ void for_all_major_for_all_nbr_high_degree( : minor_offset; auto e_op_result = evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); if (update_major) { e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); @@ -401,15 +401,15 @@ __global__ void for_all_major_for_all_nbr_high_degree( template void copy_v_transform_reduce_nbr(raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, EdgeOp e_op, T init, VertexValueOutputIterator vertex_value_output_first) @@ -475,12 +475,14 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, } } - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? 
vertex_t{0} - : matrix_partition.get_major_value_start_offset(); - auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_value_start_offset() - : vertex_t{0}; + auto matrix_partition_row_value_input = adj_matrix_row_value_input; + auto matrix_partition_col_value_input = adj_matrix_col_value_input; + if constexpr (GraphViewType::is_adj_matrix_transposed) { + matrix_partition_col_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + } else { + matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + } + std::conditional_t< GraphViewType::is_multi_gpu, std::conditional_t, @@ -506,8 +508,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_first() + (*segment_offsets)[1], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, output_buffer_first, e_op, major_init); @@ -521,8 +523,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[1], matrix_partition.get_major_first() + (*segment_offsets)[2], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, output_buffer_first + (update_major ? (*segment_offsets)[1] : vertex_t{0}), e_op, major_init); @@ -536,8 +538,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[2], matrix_partition.get_major_first() + (*segment_offsets)[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, output_buffer_first + (update_major ? (*segment_offsets)[2] : vertex_t{0}), e_op, major_init); @@ -559,8 +561,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, <<>>( matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, output_buffer_first + (update_major ? (*segment_offsets)[3] : vertex_t{0}), e_op, major_init); @@ -576,8 +578,8 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_last(), - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, output_buffer_first, e_op, major_init); @@ -681,28 +683,29 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, * and thrust::copy() (update vertex properties part, take transform_reduce output as copy input). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. 
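The @tparam rewrite just below captures the core of this patch: raw per-element iterators become device-copyable wrapper objects. A minimal host-side sketch of the contract the updated prims assume of those wrappers (value_input_wrapper_t is a hypothetical stand-in, not one of the actual cugraph classes, which differ in detail):

#include <cstddef>

// Hypothetical stand-in for the wrapper contract: value_type, get(offset)
// replacing *(input_first + offset), and add_offset() replacing the iterator
// arithmetic previously done at matrix partition boundaries.
template <typename T>
class value_input_wrapper_t {
 public:
  using value_type = T;

  explicit value_input_wrapper_t(T const* first) : first_(first) {}

  void add_offset(std::ptrdiff_t delta) { first_ += delta; }

  value_type get(std::ptrdiff_t offset) const { return *(first_ + offset); }

 private:
  T const* first_{nullptr};
};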
+ * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. * @tparam T Type of the initial value for reduction over the incoming edges. * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a value to be reduced. + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns a value to be reduced. * @param init Initial value to be added to the reduced @p e_op return values for each vertex. * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first * (inclusive) vertex (assigned to tihs process in multi-GPU). `vertex_value_output_last` @@ -710,23 +713,23 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle, * graph_view.get_number_of_local_vertices(). 
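Putting the pieces together, a caller might look roughly like the following sketch, written as a function template so the fragment is syntactically complete. The constructor arguments of row_properties_t, the dummy_properties_t template parameter, and the copy_to_adj_matrix_row signature are assumptions inferred from the parameter documentation above, not verified against the actual headers:

// Illustrative only: sums w * src_value over each vertex's incoming edges.
template <typename GraphViewType,
          typename VertexValueInputIterator,
          typename VertexValueOutputIterator>
void weighted_in_sum(raft::handle_t const& handle,
                     GraphViewType const& graph_view,
                     VertexValueInputIterator vertex_value_input_first,
                     VertexValueOutputIterator vertex_value_output_first)
{
  using weight_t = typename GraphViewType::weight_type;
  cugraph::row_properties_t<GraphViewType, weight_t> row_values(handle, graph_view);
  copy_to_adj_matrix_row(handle, graph_view, vertex_value_input_first, row_values);
  copy_v_transform_reduce_in_nbr(
    handle,
    graph_view,
    row_values.device_view(),
    cugraph::dummy_properties_t<typename GraphViewType::vertex_type>{}.device_view(),
    [] __device__(auto src, auto dst, auto w, auto src_value, auto) {
      return src_value * w;  // weighted contribution along each incoming edge
    },
    weight_t{0.0},
    vertex_value_output_first);
}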
*/ template void copy_v_transform_reduce_in_nbr(raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, EdgeOp e_op, T init, VertexValueOutputIterator vertex_value_output_first) { detail::copy_v_transform_reduce_nbr(handle, graph_view, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, e_op, init, vertex_value_output_first); @@ -740,31 +743,29 @@ void copy_v_transform_reduce_in_nbr(raft::handle_t const& handle, * input). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. * @tparam T Type of the initial value for reduction over the outgoing edges. * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first - * + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p - * adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). - * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional - * edge weight), *(@p adj_matrix_row_value_input_first + i), and *(@p - * adj_matrix_col_value_input_first + j) (where i is in [0, - * graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a value to be reduced. + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). 
Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. + * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns a value to be reduced. * @param init Initial value to be added to the reduced @p e_op return values for each vertex. * @param vertex_value_output_first Iterator pointing to the vertex property variables for the * first (inclusive) vertex (assigned to tihs process in multi-GPU). `vertex_value_output_last` @@ -772,24 +773,23 @@ void copy_v_transform_reduce_in_nbr(raft::handle_t const& handle, * graph_view.get_number_of_local_vertices(). */ template -void copy_v_transform_reduce_out_nbr( - raft::handle_t const& handle, - GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - EdgeOp e_op, - T init, - VertexValueOutputIterator vertex_value_output_first) +void copy_v_transform_reduce_out_nbr(raft::handle_t const& handle, + GraphViewType const& graph_view, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + EdgeOp e_op, + T init, + VertexValueOutputIterator vertex_value_output_first) { detail::copy_v_transform_reduce_nbr(handle, graph_view, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, e_op, init, vertex_value_output_first); diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 5ae32a6f56a..db9e98310c3 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -41,14 +41,14 @@ namespace detail { int32_t constexpr copy_v_transform_reduce_key_aggregated_out_nbr_for_all_block_size = 1024; // a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used -template +template struct minor_to_key_t { - using vertex_t = typename std::iterator_traits::value_type; - VertexIterator adj_matrix_col_key_first{}; + using vertex_t = typename AdjMatrixColKeyInputWrapper::value_type; + AdjMatrixColKeyInputWrapper adj_matrix_col_key_input{}; vertex_t minor_first{}; __device__ vertex_t operator()(vertex_t minor) { - return *(adj_matrix_col_key_first + (minor - minor_first)); + return adj_matrix_col_key_input.get(minor - minor_first); } }; @@ -209,8 +209,9 @@ void decompress_matrix_partition_to_fill_edgelist_majors( * support two level reduction for every vertex. * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColKeyInputWrapper Type of the wrapper for graph adjacency matrix column keys. * @tparam VertexIterator Type of the iterator for graph adjacency matrix column key values for * aggregation (key type should coincide with vertex type). 
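The two-level reduction this function documents, collapsed to serial host code on a toy CSR (illustration only; the real prim runs per matrix partition, with the key-to-value map distributed across GPUs):

#include <cstdio>
#include <map>
#include <vector>

int main()
{
  // toy CSR with 3 vertices; key[] assigns a key to each destination column
  std::vector<int> offsets{0, 3, 5, 6};
  std::vector<int> indices{1, 2, 2, 0, 2, 0};
  std::vector<double> weights{1.0, 2.0, 3.0, 4.0, 5.0, 6.0};
  std::vector<int> key{0, 1, 1};
  std::vector<double> key_value{10.0, 20.0};  // value looked up per key

  for (int v = 0; v < 3; ++v) {
    std::map<int, double> aggregated;  // key -> summed edge weight
    for (int e = offsets[v]; e < offsets[v + 1]; ++e) {
      aggregated[key[indices[e]]] += weights[e];
    }
    double sum{0.0};  // reduce_op == plus, init == 0
    for (auto [k, w] : aggregated) {
      sum += w * key_value[k];  // stand-in for key_aggregated_e_op
    }
    std::printf("vertex %d: %g\n", v, sum);
  }
  return 0;
}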
* @tparam ValueIterator Type of the iterator for values in (key, value) pairs. @@ -221,28 +222,27 @@ void decompress_matrix_partition_to_fill_edgelist_majors( * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_key_first Iterator pointing to the adjacency matrix column key (for - * aggregation) for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_key_last` (exclusive) is deduced as @p adj_matrix_col_key_first + @p - * graph_view.get_number_of_local_adj_matrix_partition_cols(). - * @param map_key_first Iterator pointing to the first (inclusive) key in (key, value) pairs - * (assigned to this process in multi-GPU, - * `cugraph::detail::compute_gpu_id_from_vertex_t` is used to map keys to processes). - * (Key, value) pairs may be provided by transform_reduce_by_adj_matrix_row_key_e() or - * transform_reduce_by_adj_matrix_col_key_e(). - * @param map_key_last Iterator pointing to the last (exclusive) key in (key, value) pairs (assigned - * to this process in multi-GPU). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_key_input Device-copyable wrapper used to access column keys (for the + * columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view(). Use copy_to_adj_matrix_col to fill the wrapper. + * @param map_unique_key_first Iterator pointing to the first (inclusive) key in (key, value) pairs + * (assigned to this process in multi-GPU, `cugraph::detail::compute_gpu_id_from_vertex_t` is used + * to map keys to processes). (Key, value) pairs may be provided by + * transform_reduce_by_adj_matrix_row_key_e() or transform_reduce_by_adj_matrix_col_key_e(). + * @param map_unique_key_last Iterator pointing to the last (exclusive) key in (key, value) pairs + * (assigned to this process in multi-GPU). * @param map_value_first Iterator pointing to the first (inclusive) value in (key, value) pairs * (assigned to this process in multi-GPU). `map_value_last` (exclusive) is deduced as @p - * map_value_first + thrust::distance(@p map_key_first, @p map_key_last). + * map_value_first + thrust::distance(@p map_unique_key_first, @p map_unique_key_last). * @param key_aggregated_e_op Quinary operator takes edge source, key, aggregated edge weight, *(@p * adj_matrix_row_value_input_first + i), and value for the key stored in the input (key, value) - * pairs provided by @p map_key_first, @p map_key_last, and @p map_value_first (aggregated over the - * entire set of processes in multi-GPU). 
+ * pairs provided by @p map_unique_key_first, @p map_unique_key_last, and @p map_value_first + * (aggregated over the entire set of processes in multi-GPU). * @param reduce_op Binary operator takes two input arguments and reduce the two variables to one. * @param init Initial value to be added to the reduced @p reduce_op return values for each vertex. * @param vertex_value_output_first Iterator pointing to the vertex property variables for the @@ -251,9 +251,9 @@ void decompress_matrix_partition_to_fill_edgelist_majors( * graph_view.get_number_of_local_vertices(). */ template ::value_type, + static_assert(std::is_same::value_type, typename GraphViewType::vertex_type>::value); - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); using vertex_t = typename GraphViewType::vertex_type; @@ -314,10 +312,10 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( comm.barrier(); // currently, this is ncclAllReduce #endif - auto map_counts = - host_scalar_allgather(row_comm, - static_cast(thrust::distance(map_key_first, map_key_last)), - handle.get_stream()); + auto map_counts = host_scalar_allgather( + row_comm, + static_cast(thrust::distance(map_unique_key_first, map_unique_key_last)), + handle.get_stream()); std::vector map_displacements(row_comm_size, size_t{0}); std::partial_sum(map_counts.begin(), map_counts.end() - 1, map_displacements.begin() + 1); rmm::device_uvector map_keys(map_displacements.back() + map_counts.back(), @@ -326,7 +324,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( allocate_dataframe_buffer(map_keys.size(), handle.get_stream()); for (int i = 0; i < row_comm_size; ++i) { device_bcast(row_comm, - map_key_first, + map_unique_key_first, map_keys.begin() + map_displacements[i], map_counts[i], i, @@ -341,13 +339,13 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( // FIXME: these copies are unnecessary, better fix RAFT comm's bcast to take separate input & // output pointers thrust::copy(rmm::exec_policy(handle.get_stream()), - map_key_first, - map_key_last, + map_unique_key_first, + map_unique_key_last, map_keys.begin() + map_displacements[row_comm_rank]); thrust::copy( rmm::exec_policy(handle.get_stream()), map_value_first, - map_value_first + thrust::distance(map_key_first, map_key_last), + map_value_first + thrust::distance(map_unique_key_first, map_unique_key_last), get_dataframe_buffer_begin(map_value_buffer) + map_displacements[row_comm_rank]); handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream @@ -357,8 +355,9 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( kv_map_ptr = std::make_unique< cuco::static_map>( // cuco::static_map requires at least one empty slot - std::max(static_cast(static_cast(map_keys.size()) / load_factor), - static_cast(thrust::distance(map_key_first, map_key_last)) + 1), + std::max( + static_cast(static_cast(map_keys.size()) / load_factor), + static_cast(thrust::distance(map_unique_key_first, map_unique_key_last)) + 1), invalid_vertex_id::value, invalid_vertex_id::value, stream_adapter); @@ -374,15 +373,19 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( kv_map_ptr = std::make_unique< cuco::static_map>( // cuco::static_map requires at least one empty slot - std::max(static_cast( - static_cast(thrust::distance(map_key_first, map_key_last)) / load_factor), - static_cast(thrust::distance(map_key_first, map_key_last)) + 1), + std::max( + static_cast( + 
static_cast(thrust::distance(map_unique_key_first, map_unique_key_last)) / + load_factor), + static_cast(thrust::distance(map_unique_key_first, map_unique_key_last)) + 1), invalid_vertex_id::value, invalid_vertex_id::value, stream_adapter); - auto pair_first = thrust::make_zip_iterator(thrust::make_tuple(map_key_first, map_value_first)); - kv_map_ptr->insert(pair_first, pair_first + thrust::distance(map_key_first, map_key_last)); + auto pair_first = + thrust::make_zip_iterator(thrust::make_tuple(map_unique_key_first, map_value_first)); + kv_map_ptr->insert(pair_first, + pair_first + thrust::distance(map_unique_key_first, map_unique_key_last)); } // 2. aggregate each vertex out-going edges based on keys and transform-reduce. @@ -418,8 +421,8 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( if (matrix_partition.get_major_size() > 0) { auto minor_key_first = thrust::make_transform_iterator( matrix_partition.get_indices(), - detail::minor_to_key_t{adj_matrix_col_key_first, - matrix_partition.get_minor_first()}); + detail::minor_to_key_t{adj_matrix_col_key_input, + matrix_partition.get_minor_first()}); thrust::copy(rmm::exec_policy(handle.get_stream()), minor_key_first, minor_key_first + matrix_partition.get_number_of_edges(), @@ -543,28 +546,30 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( allocate_dataframe_buffer(tmp_major_vertices.size(), handle.get_stream()); auto tmp_e_op_result_buffer_first = get_dataframe_buffer_begin(tmp_e_op_result_buffer); + auto matrix_partition_row_value_input = adj_matrix_row_value_input; + matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + auto triplet_first = thrust::make_zip_iterator(thrust::make_tuple( tmp_major_vertices.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_weights.begin())); - thrust::transform( - rmm::exec_policy(handle.get_stream()), - triplet_first, - triplet_first + tmp_major_vertices.size(), - tmp_e_op_result_buffer_first, - [adj_matrix_row_value_input_first = - adj_matrix_row_value_input_first + matrix_partition.get_major_value_start_offset(), - key_aggregated_e_op, - matrix_partition, - kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { - auto major = thrust::get<0>(val); - auto key = thrust::get<1>(val); - auto w = thrust::get<2>(val); - return key_aggregated_e_op(major, - key, - w, - *(adj_matrix_row_value_input_first + - matrix_partition.get_major_offset_from_major_nocheck(major)), - kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); - }); + thrust::transform(rmm::exec_policy(handle.get_stream()), + triplet_first, + triplet_first + tmp_major_vertices.size(), + tmp_e_op_result_buffer_first, + [matrix_partition_row_value_input, + key_aggregated_e_op, + matrix_partition, + kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { + auto major = thrust::get<0>(val); + auto key = thrust::get<1>(val); + auto w = thrust::get<2>(val); + return key_aggregated_e_op( + major, + key, + w, + matrix_partition_row_value_input.get( + matrix_partition.get_major_offset_from_major_nocheck(major)), + kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); + }); tmp_minor_keys.resize(0, handle.get_stream()); tmp_key_aggregated_edge_weights.resize(0, handle.get_stream()); tmp_minor_keys.shrink_to_fit(handle.get_stream()); diff --git a/cpp/include/cugraph/prims/count_if_e.cuh b/cpp/include/cugraph/prims/count_if_e.cuh index c0f937ee9fd..a715003e7b9 100644 --- a/cpp/include/cugraph/prims/count_if_e.cuh +++ b/cpp/include/cugraph/prims/count_if_e.cuh @@ 
-31,38 +31,38 @@ namespace cugraph { * This function is inspired by thrust::count_if(). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns true if this edge should be - * included in the returned count. + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns true if this edge should be included in the returned count. * @return GraphViewType::edge_type Number of times @p e_op returned true. 
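As the implementation just below shows, count_if_e stays a thin shim: the boolean edge operator is adapted to return integer 0/1 and handed to transform_reduce_e. The adapter's idea in isolation, as plain host code (a simplified analogue of cugraph's cast_edge_op_bool_to_integer, not the functor itself):

#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

// a boolean predicate becomes a 0/1-valued transform, so counting reduces to
// a plus-reduction with init == 0
template <typename edge_t, typename Pred>
struct cast_bool_to_integer_t {
  Pred pred{};
  edge_t operator()(int64_t x) const { return pred(x) ? edge_t{1} : edge_t{0}; }
};

int main()
{
  std::vector<int64_t> xs(10);
  std::iota(xs.begin(), xs.end(), int64_t{0});
  auto is_even = [](int64_t x) { return x % 2 == 0; };
  cast_bool_to_integer_t<int64_t, decltype(is_even)> op{is_even};
  int64_t count{0};
  for (auto x : xs) { count += op(x); }
  std::printf("count = %lld\n", static_cast<long long>(count));
  return 0;
}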
*/ template typename GraphViewType::edge_type count_if_e( raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, EdgeOp e_op) { using vertex_t = typename GraphViewType::vertex_type; @@ -70,12 +70,12 @@ typename GraphViewType::edge_type count_if_e( return transform_reduce_e(handle, graph_view, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, cast_edge_op_bool_to_integer{e_op}, edge_t{0}); diff --git a/cpp/include/cugraph/prims/property_op_utils.cuh b/cpp/include/cugraph/prims/property_op_utils.cuh index e164b14ecf2..a1e9bfd95d7 100644 --- a/cpp/include/cugraph/prims/property_op_utils.cuh +++ b/cpp/include/cugraph/prims/property_op_utils.cuh @@ -43,14 +43,14 @@ struct is_valid_edge_op< template struct evaluate_edge_op { using vertex_type = typename GraphViewType::vertex_type; using weight_type = typename GraphViewType::weight_type; - using row_value_type = typename std::iterator_traits::value_type; - using col_value_type = typename std::iterator_traits::value_type; + using row_value_type = typename AdjMatrixRowValueInputWrapper::value_type; + using col_value_type = typename AdjMatrixColValueInputWrapper::value_type; template struct cast_edge_op_bool_to_integer { static_assert(std::is_integral::value); using vertex_type = typename GraphViewType::vertex_type; using weight_type = typename GraphViewType::weight_type; - using row_value_type = typename std::iterator_traits::value_type; - using col_value_type = typename std::iterator_traits::value_type; + using row_value_type = typename AdjMatrixRowValueInputWrapper::value_type; + using col_value_type = typename AdjMatrixColValueInputWrapper::value_type; EdgeOp e_op{}; @@ -132,10 +132,10 @@ struct property_add> using Type = thrust::tuple; private: - template - __device__ constexpr auto sum_impl(T& t1, T& t2, std::index_sequence) + template + __device__ constexpr auto sum_impl(T& t1, T& t2, std::index_sequence) { - return thrust::make_tuple((thrust::get(t1) + thrust::get(t2))...); + return thrust::make_tuple((thrust::get(t1) + thrust::get(t2))...); } public: diff --git a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh index f8583d71f5c..ad357900d6a 100644 --- a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -36,9 +36,9 @@ int32_t constexpr transform_reduce_by_adj_matrix_row_col_key_e_for_all_block_siz template __device__ void update_buffer_element( @@ -49,9 +49,9 @@ __device__ void update_buffer_element( typename GraphViewType::vertex_type major, typename GraphViewType::vertex_type minor, typename GraphViewType::weight_type weight, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowColKeyInputWrapper adj_matrix_row_col_key_input, EdgeOp e_op, typename GraphViewType::vertex_type* key, T* value) @@ -65,27 +65,26 @@ __device__ void 
update_buffer_element( auto row_offset = GraphViewType::is_adj_matrix_transposed ? minor_offset : major_offset; auto col_offset = GraphViewType::is_adj_matrix_transposed ? major_offset : minor_offset; - *key = *(adj_matrix_row_col_key_first + - ((GraphViewType::is_adj_matrix_transposed != adj_matrix_row_key) ? major_offset - : minor_offset)); + *key = adj_matrix_row_col_key_input.get(( + (GraphViewType::is_adj_matrix_transposed != adj_matrix_row_key) ? major_offset : minor_offset)); *value = evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); } template __global__ void for_all_major_for_all_nbr_hypersparse( @@ -94,9 +93,9 @@ __global__ void for_all_major_for_all_nbr_hypersparse( typename GraphViewType::weight_type, GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_hypersparse_first, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowColKeyInputWrapper adj_matrix_row_col_key_input, EdgeOp e_op, typename GraphViewType::vertex_type* keys, T* values) @@ -129,9 +128,9 @@ __global__ void for_all_major_for_all_nbr_hypersparse( major, indices[i], weights ? (*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_col_key_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, + adj_matrix_row_col_key_input, e_op, keys + local_offset + i, values + local_offset + i); @@ -143,9 +142,9 @@ __global__ void for_all_major_for_all_nbr_hypersparse( template __global__ void for_all_major_for_all_nbr_low_degree( @@ -155,9 +154,9 @@ __global__ void for_all_major_for_all_nbr_low_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowColKeyInputWrapper adj_matrix_row_col_key_input, EdgeOp e_op, typename GraphViewType::vertex_type* keys, T* values) @@ -186,9 +185,9 @@ __global__ void for_all_major_for_all_nbr_low_degree( major, indices[i], weights ? 
(*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_col_key_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, + adj_matrix_row_col_key_input, e_op, keys + local_offset + i, values + local_offset + i); @@ -200,9 +199,9 @@ __global__ void for_all_major_for_all_nbr_low_degree( template __global__ void for_all_major_for_all_nbr_mid_degree( @@ -212,9 +211,9 @@ __global__ void for_all_major_for_all_nbr_mid_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowColKeyInputWrapper adj_matrix_row_col_key_input, EdgeOp e_op, typename GraphViewType::vertex_type* keys, T* values) @@ -246,9 +245,9 @@ __global__ void for_all_major_for_all_nbr_mid_degree( major, indices[i], weights ? (*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_col_key_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, + adj_matrix_row_col_key_input, e_op, keys + local_offset + i, values + local_offset + i); @@ -260,9 +259,9 @@ __global__ void for_all_major_for_all_nbr_mid_degree( template __global__ void for_all_major_for_all_nbr_high_degree( @@ -272,9 +271,9 @@ __global__ void for_all_major_for_all_nbr_high_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowColKeyInputWrapper adj_matrix_row_col_key_input, EdgeOp e_op, typename GraphViewType::vertex_type* keys, T* values) @@ -302,9 +301,9 @@ __global__ void for_all_major_for_all_nbr_high_degree( major, indices[i], weights ? 
(*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_col_key_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, + adj_matrix_row_col_key_input, e_op, keys + local_offset + i, values + local_offset + i); @@ -345,9 +344,9 @@ std::tuple, BufferType> reduce_to_unique_kv_pairs( template std::tuple, @@ -355,14 +354,14 @@ std::tuple, transform_reduce_by_adj_matrix_row_col_key_e( raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowColKeyInputWrapper adj_matrix_row_col_key_input, EdgeOp e_op, T init) { static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - static_assert(std::is_same::value_type, + static_assert(std::is_same::value); using vertex_t = typename GraphViewType::vertex_type; @@ -392,13 +391,23 @@ transform_reduce_by_adj_matrix_row_col_key_e( auto tmp_value_buffer = allocate_dataframe_buffer(tmp_keys.size(), handle.get_stream()); if (graph_view.get_vertex_partition_size(comm_root_rank) > 0) { - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? vertex_t{0} - : matrix_partition.get_major_value_start_offset(); - auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? matrix_partition.get_major_value_start_offset() - : vertex_t{0}; - auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + auto matrix_partition_row_value_input = adj_matrix_row_value_input; + auto matrix_partition_col_value_input = adj_matrix_col_value_input; + if constexpr (GraphViewType::is_adj_matrix_transposed) { + matrix_partition_col_value_input.add_offset( + matrix_partition.get_major_value_start_offset()); + } else { + matrix_partition_row_value_input.add_offset( + matrix_partition.get_major_value_start_offset()); + } + auto matrix_partition_row_col_key_input = adj_matrix_row_col_key_input; + if constexpr ((adj_matrix_row_key && !GraphViewType::is_adj_matrix_transposed) || + (!adj_matrix_row_key && GraphViewType::is_adj_matrix_transposed)) { + matrix_partition_row_col_key_input.add_offset( + matrix_partition.get_major_value_start_offset()); + } + + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); if (segment_offsets) { // FIXME: we may further improve performance by 1) concurrently running kernels on different // segments; 2) individually tuning block sizes for different segments; and 3) adding one @@ -414,10 +423,9 @@ transform_reduce_by_adj_matrix_row_col_key_e( matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_first() + (*segment_offsets)[1], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - adj_matrix_row_col_key_first + - (adj_matrix_row_key ? 
row_value_input_offset : col_value_input_offset), + matrix_partition_row_value_input, + matrix_partition_col_value_input, + matrix_partition_row_col_key_input, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -432,10 +440,9 @@ transform_reduce_by_adj_matrix_row_col_key_e( matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[1], matrix_partition.get_major_first() + (*segment_offsets)[2], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - adj_matrix_row_col_key_first + - (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), + matrix_partition_row_value_input, + matrix_partition_col_value_input, + matrix_partition_row_col_key_input, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -450,10 +457,9 @@ transform_reduce_by_adj_matrix_row_col_key_e( matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[2], matrix_partition.get_major_first() + (*segment_offsets)[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - adj_matrix_row_col_key_first + - (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), + matrix_partition_row_value_input, + matrix_partition_col_value_input, + matrix_partition_row_col_key_input, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -468,10 +474,9 @@ transform_reduce_by_adj_matrix_row_col_key_e( <<>>( matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - adj_matrix_row_col_key_first + - (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), + matrix_partition_row_value_input, + matrix_partition_col_value_input, + matrix_partition_row_col_key_input, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -487,10 +492,9 @@ transform_reduce_by_adj_matrix_row_col_key_e( matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_last(), - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, - adj_matrix_row_col_key_first + - (adj_matrix_row_key ? row_value_input_offset : col_value_input_offset), + matrix_partition_row_value_input, + matrix_partition_col_value_input, + matrix_partition_row_col_key_input, e_op, tmp_keys.data(), get_dataframe_buffer_begin(tmp_value_buffer)); @@ -562,32 +566,32 @@ transform_reduce_by_adj_matrix_row_col_key_e( * edges are determined by the graph adjacency matrix rows. * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. - * @tparam VertexIterator Type of the iterator for keys in (key, value) pairs (key type should - * coincide with vertex type). + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. + * @tparam AdjMatrixRowKeyInputWrapper Type of the wrapper for graph adjacency matrix row keys. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. 
* @tparam T Type of the values in (key, value) pairs. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). - * @param adj_matrix_row_key_first Iterator pointing to the adjacency matrix row key for the first - * (inclusive) column (assigned to this process in multi-GPU). `adj_matrix_row_key_last` (exclusive) - * is deduced as @p adj_matrix_row_key_first + @p graph_view.get_number_of_local_adj_matrix_rows(). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. + * @param adj_matrix_row_key_input Device-copyable wrapper used to access row keys(for the rows + * assigned to this process in multi-GPU). Use either cugraph::row_properties_t::device_view(). Use + * copy_to_adj_matrix_row to fill the wrapper. * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a transformed value to be reduced. + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns a transformed value to be reduced. * @param init Initial value to be added to the value in each transform-reduced (key, value) pair. * @return std::tuple Tuple of rmm::device_uvector and * rmm::device_uvector (if T is arithmetic scalar) or a tuple of rmm::device_uvector objects (if @@ -595,32 +599,31 @@ transform_reduce_by_adj_matrix_row_col_key_e( * type). 
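The contract this block documents, in serial form on toy data: every edge emits (key of its row, transformed value), and values are summed per key to yield the (unique key, reduced value) output vectors; the CUDA path reaches the same result via the per-segment kernels plus a sort/reduce-by-key pass:

#include <cstddef>
#include <cstdio>
#include <map>
#include <vector>

int main()
{
  std::vector<int> edge_rows{0, 0, 1, 2, 2, 2};  // source row of each edge
  std::vector<double> edge_values{1.0, 2.0, 3.0, 4.0, 5.0, 6.0};  // e_op outputs
  std::vector<int> row_key{7, 8, 7};             // key assigned to each row
  std::map<int, double> reduced;                 // key -> reduced value
  for (std::size_t e = 0; e < edge_rows.size(); ++e) {
    reduced[row_key[edge_rows[e]]] += edge_values[e];
  }
  for (auto [k, v] : reduced) { std::printf("key %d -> %g\n", k, v); }
  return 0;
}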
*/ template auto transform_reduce_by_adj_matrix_row_key_e( raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_row_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixRowKeyInputWrapper adj_matrix_row_key_input, EdgeOp e_op, T init) { static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - static_assert(std::is_same::value_type, + static_assert(std::is_same::value); - return detail::transform_reduce_by_adj_matrix_row_col_key_e( - handle, - graph_view, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_row_key_first, - e_op, - init); + return detail::transform_reduce_by_adj_matrix_row_col_key_e(handle, + graph_view, + adj_matrix_row_value_input, + adj_matrix_col_value_input, + adj_matrix_row_key_input, + e_op, + init); } // FIXME: EdgeOp & VertexOp in update_frontier_v_push_if_out_nbr concatenates push inidicator or @@ -632,33 +635,32 @@ auto transform_reduce_by_adj_matrix_row_key_e( * edges are determined by the graph adjacency matrix columns. * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. - * @tparam VertexIterator Type of the iterator for keys in (key, value) pairs (key type should - * coincide with vertex type). + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. + * @tparam AdjMatrixColKeyInputWrapper Type of the wrapper for graph adjacency matrix column keys. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. * @tparam T Type of the values in (key, value) pairs. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). - * @param adj_matrix_col_key_first Iterator pointing to the adjacency matrix column key for the - * first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_key_last` (exclusive) is deduced as @p adj_matrix_col_key_first + @p - * graph_view.get_number_of_local_adj_matrix_cols(). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). 
Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. + * @param adj_matrix_col_key_input Device-copyable wrapper used to access column keys(for the + * columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view(). Use copy_to_adj_matrix_col to fill the wrapper. * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a transformed value to be reduced. + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns a transformed value to be reduced. * @param init Initial value to be added to the value in each transform-reduced (key, value) pair. * @return std::tuple Tuple of rmm::device_uvector and * rmm::device_uvector (if T is arithmetic scalar) or a tuple of rmm::device_uvector objects (if @@ -666,32 +668,31 @@ auto transform_reduce_by_adj_matrix_row_key_e( * type). 
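One pattern recurs throughout this patch's per-partition loops: only the major-axis wrapper (rows when the adjacency matrix is not transposed, columns when it is) gets shifted by the partition's start offset, now via add_offset() on a local copy of the wrapper rather than iterator arithmetic. A compilable toy rendering of that fix-up (input_wrapper is a hypothetical stand-in):

#include <cassert>
#include <cstddef>
#include <vector>

template <typename T>
struct input_wrapper {  // hypothetical stand-in for the property wrappers
  T const* first{nullptr};
  void add_offset(std::ptrdiff_t d) { first += d; }
  T get(std::ptrdiff_t i) const { return first[i]; }
};

template <bool is_adj_matrix_transposed, typename T>
void shift_major_input(input_wrapper<T>& row_input,
                       input_wrapper<T>& col_input,
                       std::ptrdiff_t major_value_start_offset)
{
  if constexpr (is_adj_matrix_transposed) {
    col_input.add_offset(major_value_start_offset);  // columns are the major axis
  } else {
    row_input.add_offset(major_value_start_offset);  // rows are the major axis
  }
}

int main()
{
  std::vector<double> values{0.0, 1.0, 2.0, 3.0};
  input_wrapper<double> row{values.data()};
  input_wrapper<double> col{values.data()};
  shift_major_input<false>(row, col, 2);  // non-transposed: rows shift
  assert(row.get(0) == 2.0 && col.get(0) == 0.0);
  return 0;
}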
*/ template auto transform_reduce_by_adj_matrix_col_key_e( raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, - VertexIterator adj_matrix_col_key_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, + AdjMatrixColKeyInputWrapper adj_matrix_col_key_input, EdgeOp e_op, T init) { static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - static_assert(std::is_same::value_type, + static_assert(std::is_same::value); - return detail::transform_reduce_by_adj_matrix_row_col_key_e( - handle, - graph_view, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, - adj_matrix_col_key_first, - e_op, - init); + return detail::transform_reduce_by_adj_matrix_row_col_key_e(handle, + graph_view, + adj_matrix_row_value_input, + adj_matrix_col_value_input, + adj_matrix_col_key_input, + e_op, + init); } } // namespace cugraph diff --git a/cpp/include/cugraph/prims/transform_reduce_e.cuh b/cpp/include/cugraph/prims/transform_reduce_e.cuh index f46a00d37e4..f5b18e1efd6 100644 --- a/cpp/include/cugraph/prims/transform_reduce_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_e.cuh @@ -37,8 +37,8 @@ namespace detail { int32_t constexpr transform_reduce_e_for_all_block_size = 128; template __global__ void for_all_major_for_all_nbr_hypersparse( @@ -47,8 +47,8 @@ __global__ void for_all_major_for_all_nbr_hypersparse( typename GraphViewType::weight_type, GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_hypersparse_first, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -80,8 +80,8 @@ __global__ void for_all_major_for_all_nbr_hypersparse( thrust::make_counting_iterator(edge_t{0}), thrust::make_counting_iterator(local_degree), [&matrix_partition, - &adj_matrix_row_value_input_first, - &adj_matrix_col_value_input_first, + &adj_matrix_row_value_input, + &adj_matrix_col_value_input, &e_op, major, indices, @@ -100,14 +100,14 @@ __global__ void for_all_major_for_all_nbr_hypersparse( : minor_offset; return evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); }, e_op_result_t{}, @@ -124,8 +124,8 @@ __global__ void for_all_major_for_all_nbr_hypersparse( } template __global__ void for_all_major_for_all_nbr_low_degree( @@ -135,8 +135,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -162,8 +162,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( thrust::make_counting_iterator(edge_t{0}), 
thrust::make_counting_iterator(local_degree), [&matrix_partition, - &adj_matrix_row_value_input_first, - &adj_matrix_col_value_input_first, + &adj_matrix_row_value_input, + &adj_matrix_col_value_input, &e_op, major_offset, indices, @@ -185,14 +185,14 @@ __global__ void for_all_major_for_all_nbr_low_degree( : minor_offset; return evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); }, e_op_result_t{}, @@ -209,8 +209,8 @@ __global__ void for_all_major_for_all_nbr_low_degree( } template __global__ void for_all_major_for_all_nbr_mid_degree( @@ -220,8 +220,8 @@ __global__ void for_all_major_for_all_nbr_mid_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -262,14 +262,14 @@ __global__ void for_all_major_for_all_nbr_mid_degree( : minor_offset; auto e_op_result = evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); } @@ -283,8 +283,8 @@ __global__ void for_all_major_for_all_nbr_mid_degree( } template __global__ void for_all_major_for_all_nbr_high_degree( @@ -294,8 +294,8 @@ __global__ void for_all_major_for_all_nbr_high_degree( GraphViewType::is_multi_gpu> matrix_partition, typename GraphViewType::vertex_type major_first, typename GraphViewType::vertex_type major_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, ResultIterator result_iter /* size 1 */, EdgeOp e_op) { @@ -333,14 +333,14 @@ __global__ void for_all_major_for_all_nbr_high_degree( : minor_offset; auto e_op_result = evaluate_edge_op() .compute(row, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); e_op_result_sum = edge_property_add(e_op_result_sum, e_op_result); } @@ -361,39 +361,40 @@ __global__ void for_all_major_for_all_nbr_high_degree( * This function is inspired by thrust::transform_reduce(). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. 
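The reduction this doc block describes, spelled out serially on a toy CSR; the device code partitions the same loop by vertex degree (the hypersparse/low/mid/high kernels above), but the arithmetic is identical:

#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> offsets{0, 2, 3};            // toy CSR, 2 rows
  std::vector<int> indices{1, 0, 0};
  std::vector<double> weights{0.5, 1.5, 2.0};
  std::vector<double> row_values{10.0, 20.0};   // row (source) properties
  std::vector<double> col_values{1.0, 2.0};     // column (destination) properties
  auto e_op = [](int row, int col, double w, double rv, double cv) {
    return w * rv * cv;  // any transformed per-edge value
  };
  double result{3.0};  // init
  for (int r = 0; r < 2; ++r) {
    for (int e = offsets[r]; e < offsets[r + 1]; ++e) {
      result += e_op(r, indices[e], weights[e], row_values[r], col_values[indices[e]]);
    }
  }
  std::printf("%g\n", result);  // 3 + 10 + 15 + 40 = 68
  return 0;
}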
* @tparam T Type of the initial value. * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a transformed value to be reduced. + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns a value to be reduced. * @param init Initial value to be added to the transform-reduced input vertex properties. * @return T Reduction of the @p edge_op outputs. */ template T transform_reduce_e(raft::handle_t const& handle, GraphViewType const& graph_view, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, EdgeOp e_op, T init) { @@ -416,13 +417,15 @@ T transform_reduce_e(raft::handle_t const& handle, matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)); - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? vertex_t{0} - : matrix_partition.get_major_value_start_offset(); - auto col_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? 
matrix_partition.get_major_value_start_offset() - : vertex_t{0}; - auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); + auto matrix_partition_row_value_input = adj_matrix_row_value_input; + auto matrix_partition_col_value_input = adj_matrix_col_value_input; + if constexpr (GraphViewType::is_adj_matrix_transposed) { + matrix_partition_col_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + } else { + matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + } + + auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); if (segment_offsets) { // FIXME: we may further improve performance by 1) concurrently running kernels on different // segments; 2) individually tuning block sizes for different segments; and 3) adding one more @@ -437,8 +440,8 @@ T transform_reduce_e(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_first() + (*segment_offsets)[1], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(result_buffer), e_op); } @@ -451,8 +454,8 @@ T transform_reduce_e(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[1], matrix_partition.get_major_first() + (*segment_offsets)[2], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(result_buffer), e_op); } @@ -465,8 +468,8 @@ T transform_reduce_e(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[2], matrix_partition.get_major_first() + (*segment_offsets)[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(result_buffer), e_op); } @@ -479,8 +482,8 @@ T transform_reduce_e(raft::handle_t const& handle, <<>>( matrix_partition, matrix_partition.get_major_first() + (*segment_offsets)[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(result_buffer), e_op); } @@ -495,8 +498,8 @@ T transform_reduce_e(raft::handle_t const& handle, matrix_partition, matrix_partition.get_major_first(), matrix_partition.get_major_last(), - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first + col_value_input_offset, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(result_buffer), e_op); } diff --git a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh index 1d04dd7fa87..97c87a477cc 100644 --- a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh +++ b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh @@ -182,8 +182,8 @@ struct check_invalid_bucket_idx_t { }; template @@ -196,8 +196,8 @@ __device__ void push_if_buffer_element( typename GraphViewType::vertex_type row_offset, typename GraphViewType::vertex_type col, typename 
GraphViewType::weight_type weight, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, BufferKeyOutputIterator buffer_key_output_first, BufferPayloadOutputIterator buffer_payload_output_first, size_t* buffer_idx_ptr, @@ -211,14 +211,14 @@ __device__ void push_if_buffer_element( auto col_offset = matrix_partition.get_minor_offset_from_minor_nocheck(col); auto e_op_result = evaluate_edge_op() .compute(key, col, weight, - *(adj_matrix_row_value_input_first + row_offset), - *(adj_matrix_col_value_input_first + col_offset), + adj_matrix_row_value_input.get(row_offset), + adj_matrix_col_value_input.get(col_offset), e_op); if (e_op_result) { static_assert(sizeof(unsigned long long int) == sizeof(size_t)); @@ -241,8 +241,8 @@ __device__ void push_if_buffer_element( template @@ -254,8 +254,8 @@ __global__ void for_all_frontier_row_for_all_nbr_hypersparse( typename GraphViewType::vertex_type major_hypersparse_first, KeyIterator key_first, KeyIterator key_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, BufferKeyOutputIterator buffer_key_output_first, BufferPayloadOutputIterator buffer_payload_output_first, size_t* buffer_idx_ptr, @@ -303,8 +303,8 @@ __global__ void for_all_frontier_row_for_all_nbr_hypersparse( row_offset, indices[i], weights ? (*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, buffer_key_output_first, buffer_payload_output_first, buffer_idx_ptr, @@ -317,8 +317,8 @@ __global__ void for_all_frontier_row_for_all_nbr_hypersparse( template @@ -329,8 +329,8 @@ __global__ void for_all_frontier_row_for_all_nbr_low_degree( GraphViewType::is_multi_gpu> matrix_partition, KeyIterator key_first, KeyIterator key_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, BufferKeyOutputIterator buffer_key_output_first, BufferPayloadOutputIterator buffer_payload_output_first, size_t* buffer_idx_ptr, @@ -370,8 +370,8 @@ __global__ void for_all_frontier_row_for_all_nbr_low_degree( row_offset, indices[i], weights ? 
(*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, buffer_key_output_first, buffer_payload_output_first, buffer_idx_ptr, @@ -383,8 +383,8 @@ __global__ void for_all_frontier_row_for_all_nbr_low_degree( template @@ -395,8 +395,8 @@ __global__ void for_all_frontier_row_for_all_nbr_mid_degree( GraphViewType::is_multi_gpu> matrix_partition, KeyIterator key_first, KeyIterator key_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, BufferKeyOutputIterator buffer_key_output_first, BufferPayloadOutputIterator buffer_payload_output_first, size_t* buffer_idx_ptr, @@ -438,8 +438,8 @@ __global__ void for_all_frontier_row_for_all_nbr_mid_degree( row_offset, indices[i], weights ? (*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, buffer_key_output_first, buffer_payload_output_first, buffer_idx_ptr, @@ -452,8 +452,8 @@ __global__ void for_all_frontier_row_for_all_nbr_mid_degree( template @@ -464,8 +464,8 @@ __global__ void for_all_frontier_row_for_all_nbr_high_degree( GraphViewType::is_multi_gpu> matrix_partition, KeyIterator key_first, KeyIterator key_last, - AdjMatrixRowValueInputIterator adj_matrix_row_value_input_first, - AdjMatrixColValueInputIterator adj_matrix_col_value_input_first, + AdjMatrixRowValueInputWrapper adj_matrix_row_value_input, + AdjMatrixColValueInputWrapper adj_matrix_col_value_input, BufferKeyOutputIterator buffer_key_output_first, BufferPayloadOutputIterator buffer_payload_output_first, size_t* buffer_idx_ptr, @@ -504,8 +504,8 @@ __global__ void for_all_frontier_row_for_all_nbr_high_degree( row_offset, indices[i], weights ? (*weights)[i] : weight_t{1.0}, - adj_matrix_row_value_input_first, - adj_matrix_col_value_input_first, + adj_matrix_row_value_input, + adj_matrix_col_value_input, buffer_key_output_first, buffer_payload_output_first, buffer_idx_ptr, @@ -752,10 +752,10 @@ typename GraphViewType::edge_type compute_num_out_nbrs_from_frontier( * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexFrontierType Type of the vertex frontier class which abstracts vertex frontier * managements. - * @tparam AdjMatrixRowValueInputIterator Type of the iterator for graph adjacency matrix row - * input properties. - * @tparam AdjMatrixColValueInputIterator Type of the iterator for graph adjacency matrix column - * input properties. + * @tparam AdjMatrixRowValueInputWrapper Type of the wrapper for graph adjacency matrix row input + * properties. + * @tparam AdjMatrixColValueInputWrapper Type of the wrapper for graph adjacency matrix column input + * properties. * @tparam EdgeOp Type of the quaternary (or quinary) edge operator. * @tparam ReduceOp Type of the binary reduction operator. * @tparam VertexValueInputIterator Type of the iterator for vertex properties. @@ -770,19 +770,19 @@ typename GraphViewType::edge_type compute_num_out_nbrs_from_frontier( * current iteration. * @param next_frontier_bucket_indices Indices of the VertexFrontier buckets to store new frontier * vertices for the next iteration. 
- * @param adj_matrix_row_value_input_first Iterator pointing to the adjacency matrix row input - * properties for the first (inclusive) row (assigned to this process in multi-GPU). - * `adj_matrix_row_value_input_last` (exclusive) is deduced as @p adj_matrix_row_value_input_first + - * @p graph_view.get_number_of_local_adj_matrix_partition_rows(). - * @param adj_matrix_col_value_input_first Iterator pointing to the adjacency matrix column input - * properties for the first (inclusive) column (assigned to this process in multi-GPU). - * `adj_matrix_col_value_output_last` (exclusive) is deduced as @p adj_matrix_col_value_output_first - * + @p graph_view.get_number_of_local_adj_matrix_partition_cols(). + * @param adj_matrix_row_value_input Device-copyable wrapper used to access row input properties + * (for the rows assigned to this process in multi-GPU). Use either + * cugraph::row_properties_t::device_view() (if @p e_op needs to access row properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access row properties). Use + * copy_to_adj_matrix_row to fill the wrapper. + * @param adj_matrix_col_value_input Device-copyable wrapper used to access column input properties + * (for the columns assigned to this process in multi-GPU). Use either + * cugraph::col_properties_t::device_view() (if @p e_op needs to access column properties) or + * cugraph::dummy_properties_t::device_view() (if @p e_op does not access column properties). Use + * copy_to_adj_matrix_col to fill the wrapper. * @param e_op Quaternary (or quinary) operator takes edge source, edge destination, (optional edge - * weight), *(@p adj_matrix_row_value_input_first + i), and *(@p adj_matrix_col_value_input_first + - * j) (where i is in [0, graph_view.get_number_of_local_adj_matrix_partition_rows()) and j is in [0, - * get_number_of_local_adj_matrix_partition_cols())) and returns a value to reduced by the @p - * reduce_op. + * weight), properties for the row (i.e. source), and properties for the column (i.e. destination) + * and returns a value to be reduced the @p reduce_op. * @param reduce_op Binary operator takes two input arguments and reduce the two variables to one. * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) @@ -799,8 +799,8 @@ typename GraphViewType::edge_type compute_num_out_nbrs_from_frontier( */ template (payload_buffer, new_buffer_size, handle.get_stream()); } - auto row_value_input_offset = GraphViewType::is_adj_matrix_transposed - ? 
vertex_t{0} - : matrix_partition.get_major_value_start_offset(); + auto matrix_partition_row_value_input = adj_matrix_row_value_input; + auto matrix_partition_col_value_input = adj_matrix_col_value_input; + matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + if (segment_offsets) { static_assert(detail::num_sparse_segments_per_vertex_partition == 3); std::vector h_thresholds(detail::num_sparse_segments_per_vertex_partition + @@ -1030,8 +1031,8 @@ void update_frontier_v_push_if_out_nbr( matrix_partition, get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer), get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[0], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(key_buffer), detail::get_optional_payload_buffer_begin(payload_buffer), buffer_idx.data(), @@ -1047,8 +1048,8 @@ void update_frontier_v_push_if_out_nbr( matrix_partition, get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[0], get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[1], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(key_buffer), detail::get_optional_payload_buffer_begin(payload_buffer), buffer_idx.data(), @@ -1064,8 +1065,8 @@ void update_frontier_v_push_if_out_nbr( matrix_partition, get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[1], get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[2], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(key_buffer), detail::get_optional_payload_buffer_begin(payload_buffer), buffer_idx.data(), @@ -1082,8 +1083,8 @@ void update_frontier_v_push_if_out_nbr( matrix_partition.get_major_first() + (*segment_offsets)[3], get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[2], get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer) + h_offsets[3], - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(key_buffer), detail::get_optional_payload_buffer_begin(payload_buffer), buffer_idx.data(), @@ -1101,8 +1102,8 @@ void update_frontier_v_push_if_out_nbr( matrix_partition, get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer), get_dataframe_buffer_end(matrix_partition_frontier_key_buffer), - adj_matrix_row_value_input_first + row_value_input_offset, - adj_matrix_col_value_input_first, + matrix_partition_row_value_input, + matrix_partition_col_value_input, get_dataframe_buffer_begin(key_buffer), detail::get_optional_payload_buffer_begin(payload_buffer), buffer_idx.data(), From a6fec7e8d403566055260fce6c2afe885207f4d0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 16:33:18 -0400 Subject: [PATCH 09/57] update algorithms to use row/col properties wrapper --- cpp/src/centrality/katz_centrality.cu | 12 +- cpp/src/community/louvain.cuh | 300 ++++++++++-------- .../components/weakly_connected_components.cu | 27 +- cpp/src/link_analysis/pagerank.cu | 12 +- cpp/src/structure/coarsen_graph.cu | 
19 +- cpp/src/structure/graph_view.cu | 16 +- cpp/src/structure/relabel.cu | 1 - cpp/src/traversal/bfs.cu | 5 +- cpp/src/traversal/sssp.cu | 25 +- 9 files changed, 221 insertions(+), 196 deletions(-) diff --git a/cpp/src/centrality/katz_centrality.cu b/cpp/src/centrality/katz_centrality.cu index a638694153b..7bbc03e254e 100644 --- a/cpp/src/centrality/katz_centrality.cu +++ b/cpp/src/centrality/katz_centrality.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -91,8 +92,7 @@ void katz_centrality(raft::handle_t const& handle, // old katz centrality values rmm::device_uvector tmp_katz_centralities( pull_graph_view.get_number_of_local_vertices(), handle.get_stream()); - rmm::device_uvector adj_matrix_row_katz_centralities( - pull_graph_view.get_number_of_local_adj_matrix_partition_rows(), handle.get_stream()); + row_properties_t adj_matrix_row_katz_centralities(handle, pull_graph_view); auto new_katz_centralities = katz_centralities; auto old_katz_centralities = tmp_katz_centralities.data(); size_t iter{0}; @@ -100,14 +100,14 @@ void katz_centrality(raft::handle_t const& handle, std::swap(new_katz_centralities, old_katz_centralities); copy_to_adj_matrix_row( - handle, pull_graph_view, old_katz_centralities, adj_matrix_row_katz_centralities.begin()); + handle, pull_graph_view, old_katz_centralities, adj_matrix_row_katz_centralities); copy_v_transform_reduce_in_nbr( handle, pull_graph_view, - adj_matrix_row_katz_centralities.begin(), - thrust::make_constant_iterator(0) /* dummy */, - [alpha] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + adj_matrix_row_katz_centralities.device_view(), + dummy_properties_t{}.device_view(), + [alpha] __device__(vertex_t, vertex_t, weight_t w, auto src_val, auto) { return static_cast(alpha * src_val * w); }, betas != nullptr ? 
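// reduction init: with per-vertex betas the constant term is presumably added in a
// separate per-vertex pass (so the reduction starts from 0); with a uniform beta it
// can simply be folded into the initial value here.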
result_t{0.0} : beta, diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 09189c95e38..c1f1d4ca67d 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,8 @@ class Louvain { graph_view_t::is_adj_matrix_transposed, graph_view_t::is_multi_gpu>; + static_assert(!graph_view_t::is_adj_matrix_transposed); + Louvain(raft::handle_t const& handle, graph_view_t const& graph_view) : #ifdef TIMING @@ -60,12 +63,13 @@ class Louvain { handle_(handle), dendrogram_(std::make_unique>()), current_graph_view_(graph_view), - cluster_keys_v_(graph_view.get_number_of_local_vertices(), handle.get_stream_view()), - cluster_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream_view()), - vertex_weights_v_(graph_view.get_number_of_local_vertices(), handle.get_stream_view()), - src_vertex_weights_cache_v_(0, handle.get_stream_view()), - src_cluster_cache_v_(0, handle.get_stream_view()), - dst_cluster_cache_v_(0, handle.get_stream_view()) + cluster_keys_v_(0, handle.get_stream_view()), + cluster_weights_v_(0, handle.get_stream_view()), + vertex_weights_v_(0, handle.get_stream()), + src_vertex_weights_cache_(), + next_clusters_v_(0, handle.get_stream_view()), + src_clusters_cache_(), + dst_clusters_cache_() { } @@ -82,16 +86,16 @@ class Louvain { weight_t total_edge_weight = transform_reduce_e( handle_, current_graph_view_, - thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0), - [] __device__(auto src, auto dst, weight_t wt, auto, auto) { return wt; }, + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [] __device__(auto, auto, weight_t wt, auto, auto) { return wt; }, weight_t{0}); while (dendrogram_->num_levels() < max_level) { // // Initialize every cluster to reference each vertex to itself // - initialize_dendrogram_level(current_graph_view_.get_number_of_local_vertices()); + initialize_dendrogram_level(); compute_vertex_and_cluster_weights(); @@ -148,10 +152,11 @@ class Louvain { } protected: - void initialize_dendrogram_level(vertex_t num_vertices) + void initialize_dendrogram_level() { - dendrogram_->add_level( - current_graph_view_.get_local_vertex_first(), num_vertices, handle_.get_stream_view()); + dendrogram_->add_level(current_graph_view_.get_local_vertex_first(), + current_graph_view_.get_number_of_local_vertices(), + handle_.get_stream_view()); thrust::sequence(rmm::exec_policy(handle_.get_stream_view()), dendrogram_->current_level_begin(), @@ -160,7 +165,7 @@ class Louvain { } public: - weight_t modularity(weight_t total_edge_weight, weight_t resolution) + weight_t modularity(weight_t total_edge_weight, weight_t resolution) const { weight_t sum_degree_squared = thrust::transform_reduce( rmm::exec_policy(handle_.get_stream_view()), @@ -170,7 +175,7 @@ class Louvain { weight_t{0}, thrust::plus()); - if (graph_t::is_multi_gpu) { + if (graph_view_t::is_multi_gpu) { sum_degree_squared = host_scalar_allreduce(handle_.get_comms(), sum_degree_squared, handle_.get_stream()); } @@ -178,9 +183,15 @@ class Louvain { weight_t sum_internal = transform_reduce_e( handle_, current_graph_view_, - d_src_cluster_cache_, - d_dst_cluster_cache_, - [] __device__(auto src, auto dst, weight_t wt, auto src_cluster, auto nbr_cluster) { + graph_view_t::is_multi_gpu + ? src_clusters_cache_.device_view() + : detail::major_properties_device_view_t( + next_clusters_v_.begin()), + graph_view_t::is_multi_gpu + ? 
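// multi-GPU: read cluster assignments from the src/dst caches filled via
// copy_to_adj_matrix_row/col; single-GPU: rows and columns coincide with the local
// vertex range, so next_clusters_v_ is wrapped in a properties view directly, no copy.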
dst_clusters_cache_.device_view() + : detail::minor_properties_device_view_t( + next_clusters_v_.begin()), + [] __device__(auto, auto, weight_t wt, auto src_cluster, auto nbr_cluster) { if (src_cluster == nbr_cluster) { return wt; } else { @@ -213,10 +224,7 @@ class Louvain { vertex_weights_v_.size(), handle_.get_stream()); - d_src_vertex_weights_cache_ = - cache_src_vertex_properties(vertex_weights_v_, src_vertex_weights_cache_v_); - - if (graph_view_t::is_multi_gpu) { + if constexpr (graph_view_t::is_multi_gpu) { auto const comm_size = handle_.get_comms().get_size(); rmm::device_uvector rx_keys_v(0, handle_.get_stream_view()); rmm::device_uvector rx_weights_v(0, handle_.get_stream_view()); @@ -238,49 +246,50 @@ class Louvain { cluster_weights_v_ = std::move(rx_weights_v); } + if (graph_view_t::is_multi_gpu) { + src_vertex_weights_cache_ = + row_properties_t(handle_, current_graph_view_); + copy_to_adj_matrix_row( + handle_, current_graph_view_, vertex_weights_v_.begin(), src_vertex_weights_cache_); + vertex_weights_v_.resize(0, handle_.get_stream()); + vertex_weights_v_.shrink_to_fit(handle_.get_stream()); + } + timer_stop(handle_.get_stream_view()); } template - T* cache_src_vertex_properties(rmm::device_uvector& input, rmm::device_uvector& src_cache_v) + void cache_src_properties(rmm::device_uvector& input, + row_properties_t& src_cache_) { - if (graph_view_t::is_multi_gpu) { - src_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), - handle_.get_stream_view()); - copy_to_adj_matrix_row(handle_, current_graph_view_, input.begin(), src_cache_v.begin()); - return src_cache_v.begin(); - } else { - return input.begin(); - } + copy_to_adj_matrix_row(handle_, current_graph_view_, input.begin(), src_cache_); } template - T* cache_dst_vertex_properties(rmm::device_uvector& input, rmm::device_uvector& dst_cache_v) + void cache_dst_properties(rmm::device_uvector& input, + col_properties_t& dst_cache_) { - if (graph_view_t::is_multi_gpu) { - dst_cache_v.resize(current_graph_view_.get_number_of_local_adj_matrix_partition_cols(), - handle_.get_stream_view()); - copy_to_adj_matrix_col(handle_, current_graph_view_, input.begin(), dst_cache_v.begin()); - return dst_cache_v.begin(); - } else { - return input.begin(); - } + copy_to_adj_matrix_col(handle_, current_graph_view_, input.begin(), dst_cache_); } virtual weight_t update_clustering(weight_t total_edge_weight, weight_t resolution) { timer_start("update_clustering"); - rmm::device_uvector next_cluster_v(dendrogram_->current_level_size(), - handle_.get_stream_view()); + next_clusters_v_ = + rmm::device_uvector(dendrogram_->current_level_size(), handle_.get_stream()); - raft::copy(next_cluster_v.begin(), + raft::copy(next_clusters_v_.begin(), dendrogram_->current_level_begin(), dendrogram_->current_level_size(), handle_.get_stream()); - d_src_cluster_cache_ = cache_src_vertex_properties(next_cluster_v, src_cluster_cache_v_); - d_dst_cluster_cache_ = cache_dst_vertex_properties(next_cluster_v, dst_cluster_cache_v_); + if constexpr (graph_view_t::is_multi_gpu) { + src_clusters_cache_ = row_properties_t(handle_, current_graph_view_); + cache_src_properties(next_clusters_v_, src_clusters_cache_); + dst_clusters_cache_ = col_properties_t(handle_, current_graph_view_); + cache_dst_properties(next_clusters_v_, dst_clusters_cache_); + } weight_t new_Q = modularity(total_edge_weight, resolution); weight_t cur_Q = new_Q - 1; @@ -293,7 +302,7 @@ class Louvain { while (new_Q > (cur_Q + 0.0001)) { cur_Q = new_Q; - 
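// one local-moving pass: each vertex evaluates the modularity gain of adopting a
// neighboring cluster and keeps the best positive move; alternating `up_down`
// restricts the allowed move direction per pass, a common guard against pairs of
// vertices swapping clusters back and forth across iterations.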
update_by_delta_modularity(total_edge_weight, resolution, next_cluster_v, up_down); + update_by_delta_modularity(total_edge_weight, resolution, next_clusters_v_, up_down); up_down = !up_down; @@ -301,8 +310,8 @@ class Louvain { if (new_Q > cur_Q) { raft::copy(dendrogram_->current_level_begin(), - next_cluster_v.begin(), - next_cluster_v.size(), + next_clusters_v_.begin(), + next_clusters_v_.size(), handle_.get_stream()); } } @@ -311,86 +320,67 @@ class Louvain { return cur_Q; } - void compute_cluster_sum_and_subtract(rmm::device_uvector& old_cluster_sum_v, - rmm::device_uvector& cluster_subtract_v) + std::tuple, rmm::device_uvector> + compute_cluster_sum_and_subtract() const { - auto output_buffer = cugraph::allocate_dataframe_buffer>( - current_graph_view_.get_number_of_local_vertices(), handle_.get_stream_view()); + rmm::device_uvector old_cluster_sum_v( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + rmm::device_uvector cluster_subtract_v( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); copy_v_transform_reduce_out_nbr( handle_, current_graph_view_, - d_src_cluster_cache_, - d_dst_cluster_cache_, + graph_view_t::is_multi_gpu + ? src_clusters_cache_.device_view() + : detail::major_properties_device_view_t( + next_clusters_v_.data()), + graph_view_t::is_multi_gpu + ? dst_clusters_cache_.device_view() + : detail::minor_properties_device_view_t( + next_clusters_v_.data()), [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { - weight_t subtract{0}; weight_t sum{0}; + weight_t subtract{0}; if (src == dst) subtract = wt; else if (src_cluster == nbr_cluster) sum = wt; - return thrust::make_tuple(subtract, sum); + return thrust::make_tuple(sum, subtract); }, thrust::make_tuple(weight_t{0}, weight_t{0}), - cugraph::get_dataframe_buffer_begin>(output_buffer)); - - thrust::transform( - rmm::exec_policy(handle_.get_stream_view()), - cugraph::get_dataframe_buffer_begin>(output_buffer), - cugraph::get_dataframe_buffer_begin>(output_buffer) + - current_graph_view_.get_number_of_local_vertices(), - old_cluster_sum_v.begin(), - [] __device__(auto p) { return thrust::get<1>(p); }); + thrust::make_zip_iterator(old_cluster_sum_v.begin(), cluster_subtract_v.begin())); - thrust::transform( - rmm::exec_policy(handle_.get_stream_view()), - cugraph::get_dataframe_buffer_begin>(output_buffer), - cugraph::get_dataframe_buffer_begin>(output_buffer) + - current_graph_view_.get_number_of_local_vertices(), - cluster_subtract_v.begin(), - [] __device__(auto p) { return thrust::get<0>(p); }); + return std::make_tuple(std::move(old_cluster_sum_v), std::move(cluster_subtract_v)); } void update_by_delta_modularity(weight_t total_edge_weight, weight_t resolution, - rmm::device_uvector& next_cluster_v, + rmm::device_uvector& next_clusters_v_, bool up_down) { - rmm::device_uvector old_cluster_sum_v( - current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); - rmm::device_uvector cluster_subtract_v( - current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); - rmm::device_uvector src_cluster_weights_v(next_cluster_v.size(), - handle_.get_stream()); - - compute_cluster_sum_and_subtract(old_cluster_sum_v, cluster_subtract_v); - - auto output_buffer = cugraph::allocate_dataframe_buffer>( - current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); - - vertex_t* map_key_first; - vertex_t* map_key_last; - weight_t* map_value_first; - - if (graph_t::is_multi_gpu) { + rmm::device_uvector 
vertex_cluster_weights_v(0, handle_.get_stream()); + row_properties_t src_cluster_weights{}; + if constexpr (graph_view_t::is_multi_gpu) { cugraph::detail::compute_gpu_id_from_vertex_t vertex_to_gpu_id_op{ handle_.get_comms().get_size()}; - src_cluster_weights_v = - cugraph::collect_values_for_keys(handle_.get_comms(), - cluster_keys_v_.begin(), - cluster_keys_v_.end(), - cluster_weights_v_.data(), - d_src_cluster_cache_, - d_src_cluster_cache_ + src_cluster_cache_v_.size(), - vertex_to_gpu_id_op, - handle_.get_stream()); - - map_key_first = cluster_keys_v_.begin(); - map_key_last = cluster_keys_v_.end(); - map_value_first = cluster_weights_v_.begin(); + vertex_cluster_weights_v = cugraph::collect_values_for_keys(handle_.get_comms(), + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.data(), + next_clusters_v_.begin(), + next_clusters_v_.end(), + vertex_to_gpu_id_op, + handle_.get_stream()); + + src_cluster_weights = row_properties_t(handle_, current_graph_view_); + copy_to_adj_matrix_row( + handle_, current_graph_view_, vertex_cluster_weights_v.begin(), src_cluster_weights); + vertex_cluster_weights_v.resize(0, handle_.get_stream()); + vertex_cluster_weights_v.shrink_to_fit(handle_.get_stream()); } else { thrust::sort_by_key(rmm::exec_policy(handle_.get_stream_view()), cluster_keys_v_.begin(), @@ -398,9 +388,9 @@ class Louvain { cluster_weights_v_.begin()); thrust::transform(rmm::exec_policy(handle_.get_stream_view()), - next_cluster_v.begin(), - next_cluster_v.end(), - src_cluster_weights_v.begin(), + next_clusters_v_.begin(), + next_clusters_v_.end(), + vertex_cluster_weights_v.begin(), [d_cluster_weights = cluster_weights_v_.data(), d_cluster_keys = cluster_keys_v_.data(), num_clusters = cluster_keys_v_.size()] __device__(vertex_t cluster) { @@ -408,24 +398,51 @@ class Louvain { thrust::seq, d_cluster_keys, d_cluster_keys + num_clusters, cluster); return d_cluster_weights[pos - d_cluster_keys]; }); + } - map_key_first = d_src_cluster_cache_; - map_key_last = d_src_cluster_cache_ + src_cluster_weights_v.size(); - map_value_first = src_cluster_weights_v.begin(); + auto [old_cluster_sum_v, cluster_subtract_v] = compute_cluster_sum_and_subtract(); + + row_properties_t> + src_old_cluster_sum_subtract_pairs{}; + if constexpr (graph_view_t::is_multi_gpu) { + src_old_cluster_sum_subtract_pairs = + row_properties_t>(handle_, + current_graph_view_); + copy_to_adj_matrix_row(handle_, + current_graph_view_, + thrust::make_zip_iterator(thrust::make_tuple( + old_cluster_sum_v.begin(), cluster_subtract_v.begin())), + src_old_cluster_sum_subtract_pairs); } - rmm::device_uvector src_old_cluster_sum_v( - current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); - rmm::device_uvector src_cluster_subtract_v( - current_graph_view_.get_number_of_local_adj_matrix_partition_rows(), handle_.get_stream()); - copy_to_adj_matrix_row( - handle_, current_graph_view_, old_cluster_sum_v.begin(), src_old_cluster_sum_v.begin()); - copy_to_adj_matrix_row( - handle_, current_graph_view_, cluster_subtract_v.begin(), src_cluster_subtract_v.begin()); + auto output_buffer = cugraph::allocate_dataframe_buffer>( + current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); + + auto cluster_old_sum_subtract_pair_first = thrust::make_zip_iterator( + thrust::make_tuple(old_cluster_sum_v.cbegin(), cluster_subtract_v.cbegin())); + auto zipped_src_device_view = + graph_view_t::is_multi_gpu + ? 
device_view_concat(src_vertex_weights_cache_.device_view(), + src_clusters_cache_.device_view(), + src_cluster_weights.device_view(), + src_old_cluster_sum_subtract_pairs.device_view()) + : device_view_concat( + detail::major_properties_device_view_t( + vertex_weights_v_.data()), + detail::major_properties_device_view_t( + next_clusters_v_.data()), + detail::major_properties_device_view_t( + vertex_cluster_weights_v.data()), + detail::major_properties_device_view_t( + cluster_old_sum_subtract_pair_first)); copy_v_transform_reduce_key_aggregated_out_nbr( handle_, current_graph_view_, +#if 1 + zipped_src_device_view, + graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() : detail::minor_properties_device_view_t(next_clusters_v_.data()), +#else thrust::make_zip_iterator(thrust::make_tuple(src_old_cluster_sum_v.begin(), d_src_vertex_weights_cache_, src_cluster_subtract_v.begin(), @@ -433,16 +450,25 @@ class Louvain { src_cluster_weights_v.begin())), d_dst_cluster_cache_, - map_key_first, - map_key_last, - map_value_first, +#endif + cluster_keys_v_.begin(), + cluster_keys_v_.end(), + cluster_weights_v_.begin(), [total_edge_weight, resolution] __device__( auto src, auto neighbor_cluster, auto new_cluster_sum, auto src_info, auto a_new) { +#if 1 + auto k_k = thrust::get<0>(src_info); + auto src_cluster = thrust::get<1>(src_info); + auto a_old = thrust::get<2>(src_info); + auto old_cluster_sum = thrust::get<3>(src_info); + auto cluster_subtract = thrust::get<4>(src_info); +#else auto old_cluster_sum = thrust::get<0>(src_info); auto k_k = thrust::get<1>(src_info); auto cluster_subtract = thrust::get<2>(src_info); auto src_cluster = thrust::get<3>(src_info); auto a_old = thrust::get<4>(src_info); +#endif if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; @@ -465,10 +491,10 @@ class Louvain { thrust::transform( rmm::exec_policy(handle_.get_stream_view()), - next_cluster_v.begin(), - next_cluster_v.end(), + next_clusters_v_.begin(), + next_clusters_v_.end(), cugraph::get_dataframe_buffer_begin>(output_buffer), - next_cluster_v.begin(), + next_clusters_v_.begin(), [up_down] __device__(vertex_t old_cluster, auto p) { vertex_t new_cluster = thrust::get<0>(p); weight_t delta_modularity = thrust::get<1>(p); @@ -478,17 +504,19 @@ class Louvain { : old_cluster; }); - d_src_cluster_cache_ = cache_src_vertex_properties(next_cluster_v, src_cluster_cache_v_); - d_dst_cluster_cache_ = cache_dst_vertex_properties(next_cluster_v, dst_cluster_cache_v_); + if constexpr (graph_view_t::is_multi_gpu) { + cache_src_properties(next_clusters_v_, src_clusters_cache_); + cache_dst_properties(next_clusters_v_, dst_clusters_cache_); + } std::tie(cluster_keys_v_, cluster_weights_v_) = cugraph::transform_reduce_by_adj_matrix_row_key_e( handle_, current_graph_view_, - thrust::make_constant_iterator(0), - thrust::make_constant_iterator(0), - d_src_cluster_cache_, - [] __device__(auto src, auto dst, auto wt, auto x, auto y) { return wt; }, + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + src_clusters_cache_.device_view(), + [] __device__(auto, auto, auto wt, auto, auto) { return wt; }, weight_t{0}); } @@ -534,16 +562,16 @@ class Louvain { std::unique_ptr current_graph_{}; graph_view_t current_graph_view_; - rmm::device_uvector vertex_weights_v_; - rmm::device_uvector src_vertex_weights_cache_v_; - rmm::device_uvector src_cluster_cache_v_; - rmm::device_uvector dst_cluster_cache_v_; + // FIXME: better move inside the update_by_delta_modularity? 
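// A minimal sketch of the caching pattern behind the members below (the template
// arguments and the names `view` and `values_v` are illustrative, assuming a
// multi-GPU graph view):
//   row_properties_t<graph_view_t, vertex_t> cache(handle_, view);   // sized to the local rows
//   copy_to_adj_matrix_row(handle_, view, values_v.begin(), cache);  // fill from local vertex values
//   auto dv = cache.device_view();  // trivially device-copyable; kernels read dv.get(offset)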
rmm::device_uvector cluster_keys_v_; rmm::device_uvector cluster_weights_v_; - weight_t* d_src_vertex_weights_cache_; - vertex_t* d_src_cluster_cache_; - vertex_t* d_dst_cluster_cache_; + rmm::device_uvector vertex_weights_v_; + row_properties_t src_vertex_weights_cache_; // src cache for vertex_weights_v_ + + rmm::device_uvector next_clusters_v_; + row_properties_t src_clusters_cache_; // src cache for next_clusters_v_ + col_properties_t dst_clusters_cache_; // dst cache for next_clusters_v_ #ifdef TIMING HighResTimer hr_timer_; diff --git a/cpp/src/components/weakly_connected_components.cu b/cpp/src/components/weakly_connected_components.cu index f20356a6d58..00b211976de 100644 --- a/cpp/src/components/weakly_connected_components.cu +++ b/cpp/src/components/weakly_connected_components.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -442,7 +443,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, init_max_new_roots = std::min(init_max_new_roots, max_new_roots); } - // 2-3. initialize vertex frontier, edge_buffer, and col_components (if multi-gpu) + // 2-3. initialize vertex frontier, edge_buffer, and adj_matrix_col_components (if multi-gpu) VertexFrontier num_edge_inserts(size_t{0}, handle.get_stream_view()); - rmm::device_uvector col_components( - GraphViewType::is_multi_gpu ? level_graph_view.get_number_of_local_adj_matrix_partition_cols() - : vertex_t{0}, - handle.get_stream_view()); - if (GraphViewType::is_multi_gpu) { - thrust::fill(rmm::exec_policy(handle.get_stream_view()), - col_components.begin(), - col_components.end(), - invalid_component_id::value); + auto adj_matrix_col_components = GraphViewType::is_multi_gpu ? col_properties_t(handle, level_graph_view) : col_properties_t(); + if constexpr (GraphViewType::is_multi_gpu) { + adj_matrix_col_components.fill(invalid_component_id::value, handle.get_stream()); } // 2.4 iterate till every vertex gets visited @@ -508,7 +503,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, break; } - if (GraphViewType::is_multi_gpu) { + if constexpr (GraphViewType::is_multi_gpu) { copy_to_adj_matrix_col( handle, level_graph_view, @@ -519,7 +514,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, .end() .get_iterator_tuple()), level_components, - col_components.begin()); + adj_matrix_col_components); } auto max_pushes = @@ -543,9 +538,9 @@ void weakly_connected_components_impl(raft::handle_t const& handle, GraphViewType::is_multi_gpu ? std::vector{static_cast(Bucket::next), static_cast(Bucket::conflict)} : std::vector{static_cast(Bucket::next)}, - thrust::make_counting_iterator(0) /* dummy */, - thrust::make_counting_iterator(0) /* dummy */, - [col_components = GraphViewType::is_multi_gpu ? col_components.data() : level_components, + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [col_components = GraphViewType::is_multi_gpu ? 
adj_matrix_col_components.mutable_device_view() : detail::minor_properties_device_view_t(level_components), col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), edge_buffer_first = get_dataframe_buffer_begin>(edge_buffer), @@ -556,7 +551,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, // FIXME: better switch to atomic_ref after // https://github.com/nvidia/libcudacxx/milestone/2 auto old = - atomicCAS(col_components + col_offset, invalid_component_id::value, tag); + atomicCAS(&(col_components.get(col_offset)), invalid_component_id::value, tag); if (old != invalid_component_id::value && old != tag) { // conflict static_assert(sizeof(unsigned long long int) == sizeof(size_t)); auto edge_idx = atomicAdd(reinterpret_cast(num_edge_inserts), diff --git a/cpp/src/link_analysis/pagerank.cu b/cpp/src/link_analysis/pagerank.cu index 2d38371f7fc..b4fff1ce3e2 100644 --- a/cpp/src/link_analysis/pagerank.cu +++ b/cpp/src/link_analysis/pagerank.cu @@ -101,9 +101,9 @@ void pagerank( auto num_nonpositive_edge_weights = count_if_e( handle, pull_graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, - [] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w <= 0.0; }); CUGRAPH_EXPECTS(num_nonpositive_edge_weights == 0, @@ -233,9 +233,9 @@ void pagerank( copy_v_transform_reduce_in_nbr( handle, pull_graph_view, - adj_matrix_row_pageranks.begin(), - dummy_properties_t{}.begin(), - [alpha] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + adj_matrix_row_pageranks.device_view(), + dummy_properties_t{}.device_view(), + [alpha] __device__(vertex_t, vertex_t, weight_t w, auto src_val, auto) { return src_val * w * alpha; }, unvarying_part, diff --git a/cpp/src/structure/coarsen_graph.cu b/cpp/src/structure/coarsen_graph.cu index 967234b9e54..5bfa1e7456c 100644 --- a/cpp/src/structure/coarsen_graph.cu +++ b/cpp/src/structure/coarsen_graph.cu @@ -131,22 +131,19 @@ template + typename AdjMatrixMinorLabelInputWrapper> std::tuple, rmm::device_uvector, std::optional>> decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( raft::handle_t const& handle, matrix_partition_device_view_t const matrix_partition, - VertexIterator0 const major_label_first, - VertexIterator1 const minor_label_first, + vertex_t const* major_label_first, + AdjMatrixMinorLabelInputWrapper const minor_label_input, std::optional> const& segment_offsets) { static_assert( - std::is_same_v::value_type, vertex_t>); - static_assert( - std::is_same_v::value_type, vertex_t>); + std::is_same_v); // FIXME: it might be possible to directly create relabled & coarsened edgelist from the // compressed sparse format to save memory @@ -161,12 +158,12 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( pair_first + edgelist_major_vertices.size(), pair_first, [major_label_first, - minor_label_first, + minor_label_input, major_first = matrix_partition.get_major_first(), minor_first = matrix_partition.get_minor_first()] __device__(auto val) { return thrust::make_tuple( *(major_label_first + (thrust::get<0>(val) - major_first)), - *(minor_label_first + (thrust::get<1>(val) - minor_first))); + minor_label_input.get(thrust::get<1>(val) - minor_first)); }); auto number_of_edges = groupby_e_and_coarsen_edgelist( @@ -304,7 
+301,7 @@ coarsen_graph( matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)), major_labels.data(), - adj_matrix_minor_labels.begin(), + adj_matrix_minor_labels.device_view(), graph_view.get_local_adj_matrix_partition_segment_offsets(i)); // 1-2. globally shuffle @@ -512,7 +509,7 @@ coarsen_graph( matrix_partition_device_view_t( graph_view.get_matrix_partition_view()), labels, - labels, + detail::minor_properties_device_view_t(labels), graph_view.get_local_adj_matrix_partition_segment_offsets(0)); rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), diff --git a/cpp/src/structure/graph_view.cu b/cpp/src/structure/graph_view.cu index 05de14afd19..6c22fbac24c 100644 --- a/cpp/src/structure/graph_view.cu +++ b/cpp/src/structure/graph_view.cu @@ -93,8 +93,8 @@ rmm::device_uvector compute_minor_degrees( copy_v_transform_reduce_out_nbr( handle, graph_view, - dummy_properties_t{}.begin(), - dummy_properties_t{}.begin(), + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), [] __device__(vertex_t, vertex_t, weight_t, auto, auto) { return edge_t{1}; }, edge_t{0}, minor_degrees.data()); @@ -102,8 +102,8 @@ rmm::device_uvector compute_minor_degrees( copy_v_transform_reduce_in_nbr( handle, graph_view, - dummy_properties_t{}.begin(), - dummy_properties_t{}.begin(), + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), [] __device__(vertex_t, vertex_t, weight_t, auto, auto) { return edge_t{1}; }, edge_t{0}, minor_degrees.data()); @@ -128,8 +128,8 @@ rmm::device_uvector compute_weight_sums( copy_v_transform_reduce_in_nbr( handle, graph_view, - dummy_properties_t{}.begin(), - dummy_properties_t{}.begin(), + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w; }, weight_t{0.0}, weight_sums.data()); @@ -137,8 +137,8 @@ rmm::device_uvector compute_weight_sums( copy_v_transform_reduce_out_nbr( handle, graph_view, - dummy_properties_t{}.begin(), - dummy_properties_t{}.begin(), + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w; }, weight_t{0.0}, weight_sums.data()); diff --git a/cpp/src/structure/relabel.cu b/cpp/src/structure/relabel.cu index d01143a922e..b68b10b1838 100644 --- a/cpp/src/structure/relabel.cu +++ b/cpp/src/structure/relabel.cu @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/cpp/src/traversal/bfs.cu b/cpp/src/traversal/bfs.cu index fa653b7ddb3..70a6c72bc10 100644 --- a/cpp/src/traversal/bfs.cu +++ b/cpp/src/traversal/bfs.cu @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -117,8 +118,8 @@ void bfs(raft::handle_t const& handle, vertex_frontier, static_cast(Bucket::cur), std::vector{static_cast(Bucket::next)}, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), [vertex_partition, distances] __device__( vertex_t src, vertex_t dst, auto src_val, auto dst_val) { auto push = true; diff --git a/cpp/src/traversal/sssp.cu b/cpp/src/traversal/sssp.cu index d1cf68741f9..6c3595723d8 100644 --- a/cpp/src/traversal/sssp.cu +++ b/cpp/src/traversal/sssp.cu @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -78,9 +79,9 @@ void sssp(raft::handle_t const& handle, auto num_negative_edge_weights = count_if_e(handle, 
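// dummy_properties_t<vertex_t>{}.device_view() is the stateless no-op wrapper used when
// the edge op ignores row/column values; it replaces the old
// thrust::make_constant_iterator(0) /* dummy */ placeholders.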
push_graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, - [] __device__(vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w < 0.0; }); CUGRAPH_EXPECTS(num_negative_edge_weights == 0, @@ -112,9 +113,9 @@ void sssp(raft::handle_t const& handle, thrust::tie(average_vertex_degree, average_edge_weight) = transform_reduce_e( handle, push_graph_view, - thrust::make_constant_iterator(0) /* dummy */, - thrust::make_constant_iterator(0) /* dummy */, - [] __device__(vertex_t row, vertex_t col, weight_t w, auto row_val, auto col_val) { + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return thrust::make_tuple(weight_t{1.0}, w); }, thrust::make_tuple(weight_t{0.0}, weight_t{0.0})); @@ -134,8 +135,10 @@ void sssp(raft::handle_t const& handle, // 5. SSSP iteration - row_properties_t adj_matrix_row_distances(handle, push_graph_view); + auto adj_matrix_row_distances = GraphViewType::is_multi_gpu ? row_properties_t(handle, push_graph_view) : row_properties_t{}; + if (GraphViewType::is_multi_gpu) { adj_matrix_row_distances.fill(std::numeric_limits::max(), handle.get_stream()); + } if (push_graph_view.is_local_vertex_nocheck(source_vertex)) { vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).insert(source_vertex); @@ -143,6 +146,7 @@ void sssp(raft::handle_t const& handle, auto near_far_threshold = delta; while (true) { + if (GraphViewType::is_multi_gpu) { copy_to_adj_matrix_row( handle, push_graph_view, @@ -150,6 +154,7 @@ void sssp(raft::handle_t const& handle, vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), distances, adj_matrix_row_distances); + } auto vertex_partition = vertex_partition_device_view_t( push_graph_view.get_vertex_partition_view()); @@ -160,10 +165,10 @@ void sssp(raft::handle_t const& handle, vertex_frontier, static_cast(Bucket::cur_near), std::vector{static_cast(Bucket::next_near), static_cast(Bucket::far)}, - adj_matrix_row_distances.begin(), - thrust::make_constant_iterator(0) /* dummy */, + GraphViewType::is_multi_gpu ? 
adj_matrix_row_distances.device_view() : detail::major_properties_device_view_t(distances), + dummy_properties_t{}.device_view(), [vertex_partition, distances, cutoff] __device__( - vertex_t src, vertex_t dst, weight_t w, auto src_val, auto dst_val) { + vertex_t src, vertex_t dst, weight_t w, auto src_val, auto) { auto push = true; auto new_distance = src_val + w; auto threshold = cutoff; From 670d891776eaa7feb134ed2d70cf98e593e4e7ef Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 16:44:44 -0400 Subject: [PATCH 10/57] clang-format --- cpp/src/centrality/katz_centrality.cu | 3 +- cpp/src/community/louvain.cuh | 19 +++++++---- .../components/weakly_connected_components.cu | 20 ++++++++---- cpp/src/link_analysis/pagerank.cu | 14 ++++---- cpp/src/structure/coarsen_graph.cu | 4 +-- cpp/src/traversal/sssp.cu | 32 ++++++++++--------- 6 files changed, 51 insertions(+), 41 deletions(-) diff --git a/cpp/src/centrality/katz_centrality.cu b/cpp/src/centrality/katz_centrality.cu index 884dacc925f..a9722b448e1 100644 --- a/cpp/src/centrality/katz_centrality.cu +++ b/cpp/src/centrality/katz_centrality.cu @@ -92,7 +92,8 @@ void katz_centrality(raft::handle_t const& handle, // old katz centrality values rmm::device_uvector tmp_katz_centralities( pull_graph_view.get_number_of_local_vertices(), handle.get_stream()); - row_properties_t adj_matrix_row_katz_centralities(handle, pull_graph_view); + row_properties_t adj_matrix_row_katz_centralities(handle, + pull_graph_view); auto new_katz_centralities = katz_centralities; auto old_katz_centralities = tmp_katz_centralities.data(); size_t iter{0}; diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 3d262930fb3..aff64cad704 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -406,7 +406,7 @@ class Louvain { auto [old_cluster_sum_v, cluster_subtract_v] = compute_cluster_sum_and_subtract(); row_properties_t> - src_old_cluster_sum_subtract_pairs{}; + src_old_cluster_sum_subtract_pairs{}; if constexpr (graph_view_t::is_multi_gpu) { src_old_cluster_sum_subtract_pairs = row_properties_t>(handle_, @@ -426,9 +426,9 @@ class Louvain { auto zipped_src_device_view = graph_view_t::is_multi_gpu ? device_view_concat(src_vertex_weights_cache_.device_view(), - src_clusters_cache_.device_view(), - src_cluster_weights.device_view(), - src_old_cluster_sum_subtract_pairs.device_view()) + src_clusters_cache_.device_view(), + src_cluster_weights.device_view(), + src_old_cluster_sum_subtract_pairs.device_view()) : device_view_concat( detail::major_properties_device_view_t( vertex_weights_v_.data()), @@ -436,7 +436,8 @@ class Louvain { next_clusters_v_.data()), detail::major_properties_device_view_t( vertex_cluster_weights_v.data()), - detail::major_properties_device_view_t( + detail::major_properties_device_view_t( cluster_old_sum_subtract_pair_first)); copy_v_transform_reduce_key_aggregated_out_nbr( @@ -444,7 +445,10 @@ class Louvain { current_graph_view_, #if 1 zipped_src_device_view, - graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() : detail::minor_properties_device_view_t(next_clusters_v_.data()), + graph_view_t::is_multi_gpu + ? 
dst_clusters_cache_.device_view() + : detail::minor_properties_device_view_t( + next_clusters_v_.data()), #else thrust::make_zip_iterator(thrust::make_tuple(src_old_cluster_sum_v.begin(), d_src_vertex_weights_cache_, @@ -570,7 +574,8 @@ class Louvain { rmm::device_uvector cluster_weights_v_; rmm::device_uvector vertex_weights_v_; - row_properties_t src_vertex_weights_cache_; // src cache for vertex_weights_v_ + row_properties_t + src_vertex_weights_cache_; // src cache for vertex_weights_v_ rmm::device_uvector next_clusters_v_; row_properties_t src_clusters_cache_; // src cache for next_clusters_v_ diff --git a/cpp/src/components/weakly_connected_components.cu b/cpp/src/components/weakly_connected_components.cu index 26abb0f9f2a..35f4343f721 100644 --- a/cpp/src/components/weakly_connected_components.cu +++ b/cpp/src/components/weakly_connected_components.cu @@ -458,7 +458,10 @@ void weakly_connected_components_impl(raft::handle_t const& handle, // requires placing the atomic variable on managed memory and this make it less attractive. rmm::device_scalar num_edge_inserts(size_t{0}, handle.get_stream_view()); - auto adj_matrix_col_components = GraphViewType::is_multi_gpu ? col_properties_t(handle, level_graph_view) : col_properties_t(); + auto adj_matrix_col_components = + GraphViewType::is_multi_gpu + ? col_properties_t(handle, level_graph_view) + : col_properties_t(); if constexpr (GraphViewType::is_multi_gpu) { adj_matrix_col_components.fill(invalid_component_id::value, handle.get_stream()); } @@ -536,10 +539,13 @@ void weakly_connected_components_impl(raft::handle_t const& handle, GraphViewType::is_multi_gpu ? std::vector{static_cast(Bucket::next), static_cast(Bucket::conflict)} : std::vector{static_cast(Bucket::next)}, - dummy_properties_t{}.device_view(), - dummy_properties_t{}.device_view(), - [col_components = GraphViewType::is_multi_gpu ? adj_matrix_col_components.mutable_device_view() : detail::minor_properties_device_view_t(level_components), - col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [col_components = + GraphViewType::is_multi_gpu + ? 
adj_matrix_col_components.mutable_device_view() + : detail::minor_properties_device_view_t(level_components), + col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), edge_buffer_first = get_dataframe_buffer_begin>(edge_buffer), num_edge_inserts = @@ -548,8 +554,8 @@ void weakly_connected_components_impl(raft::handle_t const& handle, auto col_offset = dst - col_first; // FIXME: better switch to atomic_ref after // https://github.com/nvidia/libcudacxx/milestone/2 - auto old = - atomicCAS(&(col_components.get(col_offset)), invalid_component_id::value, tag); + auto old = atomicCAS( + &(col_components.get(col_offset)), invalid_component_id::value, tag); if (old != invalid_component_id::value && old != tag) { // conflict static_assert(sizeof(unsigned long long int) == sizeof(size_t)); auto edge_idx = atomicAdd(reinterpret_cast(num_edge_inserts), diff --git a/cpp/src/link_analysis/pagerank.cu b/cpp/src/link_analysis/pagerank.cu index d06677a532d..fe6a182f365 100644 --- a/cpp/src/link_analysis/pagerank.cu +++ b/cpp/src/link_analysis/pagerank.cu @@ -98,14 +98,12 @@ void pagerank( } if (pull_graph_view.is_weighted()) { - auto num_nonpositive_edge_weights = count_if_e( - handle, - pull_graph_view, - dummy_properties_t{}.device_view(), - dummy_properties_t{}.device_view(), - [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { - return w <= 0.0; - }); + auto num_nonpositive_edge_weights = + count_if_e(handle, + pull_graph_view, + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w <= 0.0; }); CUGRAPH_EXPECTS(num_nonpositive_edge_weights == 0, "Invalid input argument: input graph should have postive edge weights."); } diff --git a/cpp/src/structure/coarsen_graph.cu b/cpp/src/structure/coarsen_graph.cu index cf40b0443d5..e56a3f5503f 100644 --- a/cpp/src/structure/coarsen_graph.cu +++ b/cpp/src/structure/coarsen_graph.cu @@ -142,8 +142,7 @@ decompress_matrix_partition_to_relabeled_and_grouped_and_coarsened_edgelist( AdjMatrixMinorLabelInputWrapper const minor_label_input, std::optional> const& segment_offsets) { - static_assert( - std::is_same_v); + static_assert(std::is_same_v); // FIXME: it might be possible to directly create relabled & coarsened edgelist from the // compressed sparse format to save memory @@ -231,7 +230,6 @@ coarsen_graph( copy_to_adj_matrix_col(handle, graph_view, labels, adj_matrix_minor_labels); } - std::vector> coarsened_edgelist_major_vertices{}; std::vector> coarsened_edgelist_minor_vertices{}; auto coarsened_edgelist_weights = diff --git a/cpp/src/traversal/sssp.cu b/cpp/src/traversal/sssp.cu index c049d853867..4ca37b3bdaa 100644 --- a/cpp/src/traversal/sssp.cu +++ b/cpp/src/traversal/sssp.cu @@ -78,11 +78,9 @@ void sssp(raft::handle_t const& handle, auto num_negative_edge_weights = count_if_e(handle, push_graph_view, - dummy_properties_t{}.device_view(), - dummy_properties_t{}.device_view(), - [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { - return w < 0.0; - }); + dummy_properties_t{}.device_view(), + dummy_properties_t{}.device_view(), + [] __device__(vertex_t, vertex_t, weight_t w, auto, auto) { return w < 0.0; }); CUGRAPH_EXPECTS(num_negative_edge_weights == 0, "Invalid input argument: input graph should have non-negative edge weights."); } @@ -134,7 +132,9 @@ void sssp(raft::handle_t const& handle, // 5. SSSP iteration - auto adj_matrix_row_distances = GraphViewType::is_multi_gpu ? 
row_properties_t(handle, push_graph_view) : row_properties_t{}; + auto adj_matrix_row_distances = + GraphViewType::is_multi_gpu ? row_properties_t(handle, push_graph_view) + : row_properties_t{}; if (GraphViewType::is_multi_gpu) { adj_matrix_row_distances.fill(std::numeric_limits::max(), handle.get_stream()); } @@ -145,14 +145,14 @@ void sssp(raft::handle_t const& handle, auto near_far_threshold = delta; while (true) { - if (GraphViewType::is_multi_gpu) { - copy_to_adj_matrix_row( - handle, - push_graph_view, - vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).begin(), - vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), - distances, - adj_matrix_row_distances); + if (GraphViewType::is_multi_gpu) { + copy_to_adj_matrix_row( + handle, + push_graph_view, + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).begin(), + vertex_frontier.get_bucket(static_cast(Bucket::cur_near)).end(), + distances, + adj_matrix_row_distances); } auto vertex_partition = vertex_partition_device_view_t( @@ -164,7 +164,9 @@ void sssp(raft::handle_t const& handle, vertex_frontier, static_cast(Bucket::cur_near), std::vector{static_cast(Bucket::next_near), static_cast(Bucket::far)}, - GraphViewType::is_multi_gpu ? adj_matrix_row_distances.device_view() : detail::major_properties_device_view_t(distances), + GraphViewType::is_multi_gpu + ? adj_matrix_row_distances.device_view() + : detail::major_properties_device_view_t(distances), dummy_properties_t{}.device_view(), [vertex_partition, distances, cutoff] __device__( vertex_t src, vertex_t dst, weight_t w, auto src_val, auto) { From e2e4b1383dd80a4bde9a4125d82a7711b1a898e7 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 31 Aug 2021 16:48:53 -0400 Subject: [PATCH 11/57] replace rmm::exec_policy(handle.get_stream()) with handle.get_thrust_policy() --- cpp/src/structure/renumber_edgelist.cu | 2 +- cpp/tests/sampling/rw_low_level_test.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist.cu b/cpp/src/structure/renumber_edgelist.cu index 4123bb5f218..ecdb06e399a 100644 --- a/cpp/src/structure/renumber_edgelist.cu +++ b/cpp/src/structure/renumber_edgelist.cu @@ -550,7 +550,7 @@ void expensive_check_edgelist( for (int j = 0; j < row_comm_size; ++j) { CUGRAPH_EXPECTS( thrust::count_if( - rmm::exec_policy(handle.get_stream_view()), + handle.get_thrust_policy(), edgelist_minor_vertices[i] + (*edgelist_intra_partition_segment_offsets)[i][j], edgelist_minor_vertices[i] + (*edgelist_intra_partition_segment_offsets)[i][j + 1], [row_comm_size, diff --git a/cpp/tests/sampling/rw_low_level_test.cu b/cpp/tests/sampling/rw_low_level_test.cu index 0977d1031bf..3711fb3f98f 100644 --- a/cpp/tests/sampling/rw_low_level_test.cu +++ b/cpp/tests/sampling/rw_low_level_test.cu @@ -73,7 +73,7 @@ void next_biased(raft::handle_t const& handle, vector_test_t& d_next_v, selector_t const& selector) { - thrust::transform(rmm::exec_policy(handle.get_stream_view()), + thrust::transform(handle.get_thrust_policy(), d_src_v.begin(), d_src_v.end(), d_rnd.begin(), From a35d137effd7d02c388903b3e04463361acf293e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 1 Sep 2021 00:06:54 -0400 Subject: [PATCH 12/57] code refinements --- .../cugraph/prims/row_col_properties.cuh | 17 +++-- .../cugraph/utilities/dataframe_buffer.cuh | 65 +++++++++++++++++++ cpp/src/community/louvain.cuh | 49 +++----------- .../components/weakly_connected_components.cu | 2 +- 4 files changed, 84 insertions(+), 49 deletions(-) diff --git 
a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh index adf068d2eb5..3aa14a2e859 100644 --- a/cpp/include/cugraph/prims/row_col_properties.cuh +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -48,7 +48,8 @@ class major_properties_device_view_t { ValueIterator value_data() const { return value_first_; } - __device__ auto get(vertex_t offset) const { return *(value_first_ + offset); } + __device__ ValueIterator get_iter(vertex_t offset) const { return value_first_ + offset; } + __device__ value_type get(vertex_t offset) const { return *get_iter(offset); } private: ValueIterator value_first_{}; @@ -73,7 +74,7 @@ class minor_properties_device_view_t { { } - __device__ auto& get(vertex_t offset) const + __device__ ValueIterator get_iter(vertex_t offset) const { auto value_offset = offset; if (key_first_) { @@ -81,9 +82,11 @@ class minor_properties_device_view_t { assert((it != *key_last_) && (*it == offset)); value_offset = static_cast(thrust::distance(*key_first_, it)); } - return *(value_first_ + value_offset); + return value_first_ + value_offset; } + __device__ value_type get(vertex_t offset) const { return *get_iter(offset); } + private: thrust::optional key_first_{thrust::nullopt}; thrust::optional key_last_{thrust::nullopt}; @@ -113,15 +116,13 @@ class major_properties_t { auto device_view() const { - auto value_first = get_dataframe_buffer_begin(buffer_); + auto value_first = get_dataframe_buffer_cbegin(buffer_); return major_properties_device_view_t(value_first); } auto mutable_device_view() { auto value_first = get_dataframe_buffer_begin(buffer_); - static_assert( - !std::is_const_v::value_type>); return major_properties_device_view_t(value_first); } @@ -168,7 +169,7 @@ class minor_properties_t { auto device_view() const { - auto value_first = get_dataframe_buffer_begin(buffer_); + auto value_first = get_dataframe_buffer_cbegin(buffer_); if (key_first_) { return minor_properties_device_view_t( *key_first_, *key_last_, value_first); @@ -180,8 +181,6 @@ class minor_properties_t { auto mutable_device_view() { auto value_first = get_dataframe_buffer_begin(buffer_); - static_assert( - !std::is_const_v::value_type>); if (key_first_) { return minor_properties_device_view_t( *key_first_, *key_last_, value_first); diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.cuh b/cpp/include/cugraph/utilities/dataframe_buffer.cuh index 04c5db91d89..28afadedf8e 100644 --- a/cpp/include/cugraph/utilities/dataframe_buffer.cuh +++ b/cpp/include/cugraph/utilities/dataframe_buffer.cuh @@ -92,6 +92,21 @@ auto get_dataframe_buffer_begin_tuple_impl(std::index_sequence, BufferTyp get_dataframe_buffer_begin_tuple_element_impl(buffer)...); } +template +auto get_dataframe_buffer_cbegin_tuple_element_impl(BufferType& buffer) +{ + using element_t = typename thrust::tuple_element::type; + return std::get(buffer).cbegin(); +} + +template +auto get_dataframe_buffer_cbegin_tuple_impl(std::index_sequence, BufferType& buffer) +{ + // thrust::make_tuple instead of std::make_tuple as this is fed to thrust::make_zip_iterator. 
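// Illustrative aside, not part of this diff: a dataframe buffer for thrust::tuple
// elements is a std::tuple of rmm::device_uvector's, and the cbegin()/cend() variants
// added here zip the const iterators of those vectors. A minimal sketch, assuming a
// handle and a size n (element type chosen only for the example):
//   auto buf    = allocate_dataframe_buffer<thrust::tuple<int32_t, float>>(n, handle.get_stream());
//   auto first  = get_dataframe_buffer_begin<thrust::tuple<int32_t, float>>(buf);   // mutable zip iterator
//   auto cfirst = get_dataframe_buffer_cbegin<thrust::tuple<int32_t, float>>(buf);  // zip of const iterators
// device_view() in row_col_properties.cuh above is now built on the cbegin() variant,
// so views handed to read-only primitives cannot be written through;
// mutable_device_view() keeps using begin().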
+ return thrust::make_tuple( + get_dataframe_buffer_cbegin_tuple_element_impl(buffer)...); +} + template auto get_dataframe_buffer_end_tuple_element_impl(BufferType& buffer) { @@ -106,6 +121,20 @@ auto get_dataframe_buffer_end_tuple_impl(std::index_sequence, BufferType& return thrust::make_tuple(get_dataframe_buffer_end_tuple_element_impl(buffer)...); } +template +auto get_dataframe_buffer_cend_tuple_element_impl(BufferType& buffer) +{ + using element_t = typename thrust::tuple_element::type; + return std::get(buffer).cend(); +} + +template +auto get_dataframe_buffer_cend_tuple_impl(std::index_sequence, BufferType& buffer) +{ + // thrust::make_tuple instead of std::make_tuple as this is fed to thrust::make_zip_iterator. + return thrust::make_tuple(get_dataframe_buffer_cend_tuple_element_impl(buffer)...); +} + } // namespace detail template ::value>* = nullptr> @@ -200,6 +229,24 @@ auto get_dataframe_buffer_begin(BufferType& buffer) std::make_index_sequence(), buffer)); } +template ::value>* = nullptr> +auto get_dataframe_buffer_cbegin(BufferType& buffer) +{ + return buffer.cbegin(); +} + +template ::value>* = nullptr> +auto get_dataframe_buffer_cbegin(BufferType& buffer) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + return thrust::make_zip_iterator(detail::get_dataframe_buffer_cbegin_tuple_impl( + std::make_index_sequence(), buffer)); +} + template ::value>* = nullptr> @@ -218,4 +265,22 @@ auto get_dataframe_buffer_end(BufferType& buffer) detail::get_dataframe_buffer_end_tuple_impl(std::make_index_sequence(), buffer)); } +template ::value>* = nullptr> +auto get_dataframe_buffer_cend(BufferType& buffer) +{ + return buffer.cend(); +} + +template ::value>* = nullptr> +auto get_dataframe_buffer_cend(BufferType& buffer) +{ + size_t constexpr tuple_size = thrust::tuple_size::value; + return thrust::make_zip_iterator(detail::get_dataframe_buffer_cend_tuple_impl( + std::make_index_sequence(), buffer)); +} + } // namespace cugraph diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index aff64cad704..abcfe41d8b3 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -249,7 +249,7 @@ class Louvain { cluster_weights_v_ = std::move(rx_weights_v); } - if (graph_view_t::is_multi_gpu) { + if constexpr (graph_view_t::is_multi_gpu) { src_vertex_weights_cache_ = row_properties_t(handle_, current_graph_view_); copy_to_adj_matrix_row( @@ -261,20 +261,6 @@ class Louvain { timer_stop(handle_.get_stream_view()); } - template - void cache_src_properties(rmm::device_uvector& input, - row_properties_t& src_cache_) - { - copy_to_adj_matrix_row(handle_, current_graph_view_, input.begin(), src_cache_); - } - - template - void cache_dst_properties(rmm::device_uvector& input, - col_properties_t& dst_cache_) - { - copy_to_adj_matrix_col(handle_, current_graph_view_, input.begin(), dst_cache_); - } - virtual weight_t update_clustering(weight_t total_edge_weight, weight_t resolution) { timer_start("update_clustering"); @@ -289,9 +275,11 @@ class Louvain { if constexpr (graph_view_t::is_multi_gpu) { src_clusters_cache_ = row_properties_t(handle_, current_graph_view_); - cache_src_properties(next_clusters_v_, src_clusters_cache_); + copy_to_adj_matrix_row( + handle_, current_graph_view_, next_clusters_v_.begin(), src_clusters_cache_); dst_clusters_cache_ = col_properties_t(handle_, current_graph_view_); - cache_dst_properties(next_clusters_v_, dst_clusters_cache_); + copy_to_adj_matrix_col( + handle_, current_graph_view_, next_clusters_v_.begin(), 
dst_clusters_cache_); } weight_t new_Q = modularity(total_edge_weight, resolution); @@ -418,7 +406,7 @@ class Louvain { src_old_cluster_sum_subtract_pairs); } - auto output_buffer = cugraph::allocate_dataframe_buffer>( + auto output_buffer = allocate_dataframe_buffer>( current_graph_view_.get_number_of_local_vertices(), handle_.get_stream()); auto cluster_old_sum_subtract_pair_first = thrust::make_zip_iterator( @@ -443,39 +431,21 @@ class Louvain { copy_v_transform_reduce_key_aggregated_out_nbr( handle_, current_graph_view_, -#if 1 zipped_src_device_view, graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() : detail::minor_properties_device_view_t( next_clusters_v_.data()), -#else - thrust::make_zip_iterator(thrust::make_tuple(src_old_cluster_sum_v.begin(), - d_src_vertex_weights_cache_, - src_cluster_subtract_v.begin(), - d_src_cluster_cache_, - src_cluster_weights_v.begin())), - - d_dst_cluster_cache_, -#endif cluster_keys_v_.begin(), cluster_keys_v_.end(), cluster_weights_v_.begin(), [total_edge_weight, resolution] __device__( auto src, auto neighbor_cluster, auto new_cluster_sum, auto src_info, auto a_new) { -#if 1 auto k_k = thrust::get<0>(src_info); auto src_cluster = thrust::get<1>(src_info); auto a_old = thrust::get<2>(src_info); auto old_cluster_sum = thrust::get<3>(src_info); auto cluster_subtract = thrust::get<4>(src_info); -#else - auto old_cluster_sum = thrust::get<0>(src_info); - auto k_k = thrust::get<1>(src_info); - auto cluster_subtract = thrust::get<2>(src_info); - auto src_cluster = thrust::get<3>(src_info); - auto a_old = thrust::get<4>(src_info); -#endif if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; @@ -512,8 +482,10 @@ class Louvain { }); if constexpr (graph_view_t::is_multi_gpu) { - cache_src_properties(next_clusters_v_, src_clusters_cache_); - cache_dst_properties(next_clusters_v_, dst_clusters_cache_); + copy_to_adj_matrix_row( + handle_, current_graph_view_, next_clusters_v_.begin(), src_clusters_cache_); + copy_to_adj_matrix_row( + handle_, current_graph_view_, next_clusters_v_.begin(), src_clusters_cache_); } std::tie(cluster_keys_v_, cluster_weights_v_) = @@ -569,7 +541,6 @@ class Louvain { std::unique_ptr current_graph_{}; graph_view_t current_graph_view_; - // FIXME: better move inside the update_by_delta_modularity? 
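// Illustrative recap, not part of this diff, of the multi-GPU caching pattern these
// members support (it mirrors the copy_to_adj_matrix_row/col calls in update_clustering
// above; in single-GPU builds the vertex arrays are read directly instead):
//   if constexpr (graph_view_t::is_multi_gpu) {
//     src_clusters_cache_ = row_properties_t<graph_view_t, vertex_t>(handle_, current_graph_view_);
//     copy_to_adj_matrix_row(handle_, current_graph_view_, next_clusters_v_.begin(), src_clusters_cache_);
//     dst_clusters_cache_ = col_properties_t<graph_view_t, vertex_t>(handle_, current_graph_view_);
//     copy_to_adj_matrix_col(handle_, current_graph_view_, next_clusters_v_.begin(), dst_clusters_cache_);
//   }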
rmm::device_uvector cluster_keys_v_; rmm::device_uvector cluster_weights_v_; diff --git a/cpp/src/components/weakly_connected_components.cu b/cpp/src/components/weakly_connected_components.cu index 35f4343f721..0bc1bd4996b 100644 --- a/cpp/src/components/weakly_connected_components.cu +++ b/cpp/src/components/weakly_connected_components.cu @@ -555,7 +555,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, // FIXME: better switch to atomic_ref after // https://github.com/nvidia/libcudacxx/milestone/2 auto old = atomicCAS( - &(col_components.get(col_offset)), invalid_component_id::value, tag); + col_components.get_iter(col_offset), invalid_component_id::value, tag); if (old != invalid_component_id::value && old != tag) { // conflict static_assert(sizeof(unsigned long long int) == sizeof(size_t)); auto edge_idx = atomicAdd(reinterpret_cast(num_edge_inserts), From 83d7313804124aa045e35cd526f234b621f7fcda Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 1 Sep 2021 10:18:56 -0400 Subject: [PATCH 13/57] code clean-up --- cpp/include/cugraph/graph_view.hpp | 14 ++++---------- ...y_v_transform_reduce_key_aggregated_out_nbr.cuh | 1 + 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 3cab3b7ff8f..81aa00fd2ea 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -776,22 +776,22 @@ class graph_view_t get_local_sorted_unique_edge_row_begin() const { - return local_sorted_unique_edge_row_first_; + return std::nullopt; } std::optional get_local_sorted_unique_edge_row_end() const { - return local_sorted_unique_edge_row_last_; + return std::nullopt; } std::optional get_local_sorted_unique_edge_col_begin() const { - return local_sorted_unique_edge_col_first_; + return std::nullopt; } std::optional get_local_sorted_unique_edge_col_end() const { - return local_sorted_unique_edge_col_last_; + return std::nullopt; } private: @@ -801,12 +801,6 @@ class graph_view_t> segment_offsets_{std::nullopt}; - - // FIXME: to be implemented. 
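// Caller-side sketch (illustrative, not part of this diff): since the single-GPU
// specialization now returns std::nullopt unconditionally, callers can branch on the
// optional instead of on is_multi_gpu:
//   if (auto row_first = graph_view.get_local_sorted_unique_edge_row_begin()) {
//     // key/value path: row properties are stored only for the sorted unique edge rows
//   } else {
//     // dense path: row properties are stored for every local row
//   }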
- std::optional local_sorted_unique_edge_row_first_{std::nullopt}; - std::optional local_sorted_unique_edge_row_last_{std::nullopt}; - std::optional local_sorted_unique_edge_col_first_{std::nullopt}; - std::optional local_sorted_unique_edge_col_last_{std::nullopt}; }; } // namespace cugraph diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index f34dc0af660..0ed211b9002 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -425,6 +425,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( matrix_partition.get_indices(), detail::minor_to_key_t{adj_matrix_col_key_input, matrix_partition.get_minor_first()}); + auto execution_policy = handle.get_thrust_policy(); thrust::copy(execution_policy, minor_key_first, minor_key_first + matrix_partition.get_number_of_edges(), From 92972ed82bcc869c5acf3e053b20a78b608f6808 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 1 Sep 2021 10:24:43 -0400 Subject: [PATCH 14/57] clang-format --- .../prims/copy_to_adj_matrix_row_col.cuh | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index 37456c4b8bf..183eb38f944 100644 --- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -440,7 +440,8 @@ void copy_to_matrix_minor(raft::handle_t const& handle, * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_row_value_output Wrapper used to access data storage to copy row properties (for the rows assigned to this process in multi-GPU). + * @param adj_matrix_row_value_output Wrapper used to access data storage to copy row properties + * (for the rows assigned to this process in multi-GPU). */ template void copy_to_adj_matrix_row( @@ -452,11 +453,9 @@ void copy_to_adj_matrix_row( adj_matrix_row_value_output) { if constexpr (GraphViewType::is_adj_matrix_transposed) { - copy_to_matrix_minor( - handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output); + copy_to_matrix_minor(handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output); } else { - copy_to_matrix_major( - handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output); + copy_to_matrix_major(handle, graph_view, vertex_value_input_first, adj_matrix_row_value_output); } } @@ -481,7 +480,8 @@ void copy_to_adj_matrix_row( * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_row_value_output Wrapper used to access data storage to copy row properties (for the rows assigned to this process in multi-GPU). + * @param adj_matrix_row_value_output Wrapper used to access data storage to copy row properties + * (for the rows assigned to this process in multi-GPU). 
*/ template void copy_to_adj_matrix_row( @@ -525,7 +525,8 @@ void copy_to_adj_matrix_row( * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_col_value_output Wrapper used to access data storage to copy column properties (for the columns assigned to this process in multi-GPU). + * @param adj_matrix_col_value_output Wrapper used to access data storage to copy column properties + * (for the columns assigned to this process in multi-GPU). */ template void copy_to_adj_matrix_col( @@ -537,11 +538,9 @@ void copy_to_adj_matrix_col( adj_matrix_col_value_output) { if constexpr (GraphViewType::is_adj_matrix_transposed) { - copy_to_matrix_major( - handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output); + copy_to_matrix_major(handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output); } else { - copy_to_matrix_minor( - handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output); + copy_to_matrix_minor(handle, graph_view, vertex_value_input_first, adj_matrix_col_value_output); } } @@ -566,7 +565,8 @@ void copy_to_adj_matrix_col( * @param vertex_value_input_first Iterator pointing to the vertex properties for the first * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_input_last` (exclusive) * is deduced as @p vertex_value_input_first + @p graph_view.get_number_of_local_vertices(). - * @param adj_matrix_col_value_output Wrapper used to access data storage to copy column properties (for the columns assigned to this process in multi-GPU). + * @param adj_matrix_col_value_output Wrapper used to access data storage to copy column properties + * (for the columns assigned to this process in multi-GPU). */ template void copy_to_adj_matrix_col( From f39d26672b9ba37a412a8a1c6dbec616388a9927 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 1 Sep 2021 11:32:42 -0400 Subject: [PATCH 15/57] documentation update --- cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index 183eb38f944..ab27e7cc3c7 100644 --- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -429,8 +429,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, * @brief Copy vertex property values to the corresponding graph adjacency matrix row property * variables. * - * This version fills the entire set of graph adjacency matrix row property values. This function is - * inspired by thrust::copy(). + * This version fills the entire set of graph adjacency matrix row property values. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexValueInputIterator Type of the iterator for vertex properties. @@ -465,7 +464,7 @@ void copy_to_adj_matrix_row( * * This version fills only a subset of graph adjacency matrix row property values. [@p vertex_first, * @p vertex_last) specifies the vertices with new values to be copied to graph adjacency matrix row - * property variables. This function is inspired by thrust::copy(). + * property variables. * * @tparam GraphViewType Type of the passed non-owning graph object. 
* @tparam VertexIterator Type of the iterator for vertex identifiers. From 06c3fa9b64ac75d0352471e645a05b49d8e4e0e6 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 1 Sep 2021 14:00:03 -0400 Subject: [PATCH 16/57] bug fixes --- cpp/src/community/louvain.cuh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index abcfe41d8b3..ea8b753494f 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -378,6 +378,7 @@ class Louvain { cluster_keys_v_.end(), cluster_weights_v_.begin()); + vertex_cluster_weights_v.resize(next_clusters_v_.size(), handle_.get_stream()); thrust::transform(handle_.get_thrust_policy(), next_clusters_v_.begin(), next_clusters_v_.end(), @@ -494,7 +495,10 @@ class Louvain { current_graph_view_, dummy_properties_t{}.device_view(), dummy_properties_t{}.device_view(), - src_clusters_cache_.device_view(), + graph_view_t::is_multi_gpu + ? src_clusters_cache_.device_view() + : detail::major_properties_device_view_t( + next_clusters_v_.data()), [] __device__(auto, auto, auto wt, auto, auto) { return wt; }, weight_t{0}); } From 6fde1c0ac7dec5a638c4f72760fea882d68ce9ae Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 1 Sep 2021 15:55:54 -0400 Subject: [PATCH 17/57] additional bug fix --- cpp/src/structure/create_graph_from_edgelist.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/structure/create_graph_from_edgelist.cpp b/cpp/src/structure/create_graph_from_edgelist.cpp index d3a385b05bf..c9181f1e000 100644 --- a/cpp/src/structure/create_graph_from_edgelist.cpp +++ b/cpp/src/structure/create_graph_from_edgelist.cpp @@ -113,8 +113,10 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, *vertex_partition_segment_offsets) = cugraph::renumber_edgelist( handle, - std::optional>{std::make_tuple( - (*local_vertex_span).data(), static_cast((*local_vertex_span).size()))}, + local_vertex_span + ? std::optional>{std::make_tuple( + (*local_vertex_span).data(), static_cast((*local_vertex_span).size()))} + : std::nullopt, major_ptrs, minor_ptrs, edgelist_edge_counts,
From dafa4ed3c0b6ac373416e7cfc46698a1eba1464a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 2 Sep 2021 10:52:25 -0400 Subject: [PATCH 19/57] device lambda to struct functor --- ...ransform_reduce_key_aggregated_out_nbr.cuh | 72 +++++++++++++------ cpp/src/community/louvain.cuh | 49 ++++++++----- 2 files changed, 85 insertions(+), 36 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 0ed211b9002..af25fae7234 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -46,12 +46,51 @@ struct minor_to_key_t { using vertex_t = typename AdjMatrixColKeyInputWrapper::value_type; AdjMatrixColKeyInputWrapper adj_matrix_col_key_input{}; vertex_t minor_first{}; - __device__ vertex_t operator()(vertex_t minor) + __device__ vertex_t operator()(vertex_t minor) const { return adj_matrix_col_key_input.get(minor - minor_first); } }; +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct minor_key_to_col_rank_t { + compute_gpu_id_from_vertex_t key_func{}; + int row_comm_size{}; + __device__ int operator()( + thrust::tuple val /* major, minor key, weight */) const + { + return key_func(thrust::get<1>(val)) / row_comm_size; + } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct call_key_aggregated_e_op_t { + AdjMatrixRowValueInputWrapper matrix_partition_row_value_input{}; + KeyAggregatedEdgeOp key_aggregated_e_op{}; + MatrixPartitionDeviceView matrix_partition{}; + StaticMapDeviceView kv_map{}; + __device__ auto operator()( + thrust::tuple val /* major, minor key, weight */) const + { + auto major = thrust::get<0>(val); + auto key = thrust::get<1>(val); + auto w = thrust::get<2>(val); + return key_aggregated_e_op(major, + key, + w, + matrix_partition_row_value_input.get( + matrix_partition.get_major_offset_from_major_nocheck(major)), + kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); + } +}; + template __global__ void for_all_major_for_all_nbr_mid_degree( matrix_partition_device_view_t matrix_partition, @@ -512,10 +551,8 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( col_comm, triplet_first, triplet_first + tmp_major_vertices.size(), - [key_func = detail::compute_gpu_id_from_vertex_t{comm_size}, - row_comm_size] __device__(auto val) { - return key_func(thrust::get<1>(val)) / row_comm_size; - }, + detail::minor_key_to_col_rank_t{ + detail::compute_gpu_id_from_vertex_t{comm_size}, row_comm_size}, handle.get_stream()); auto pair_first = thrust::make_zip_iterator( @@ -558,21 +595,16 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( triplet_first, triplet_first + tmp_major_vertices.size(), tmp_e_op_result_buffer_first, - [matrix_partition_row_value_input, - key_aggregated_e_op, - matrix_partition, - kv_map = kv_map_ptr->get_device_view()] __device__(auto val) { - auto major = thrust::get<0>(val); - auto key = thrust::get<1>(val); - auto w = thrust::get<2>(val); - return key_aggregated_e_op( - major, - key, - w, - matrix_partition_row_value_input.get( - matrix_partition.get_major_offset_from_major_nocheck(major)), - 
kv_map.find(key)->second.load(cuda::std::memory_order_relaxed)); - }); + detail::call_key_aggregated_e_op_tget_device_view())>{ + matrix_partition_row_value_input, + key_aggregated_e_op, + matrix_partition, + kv_map_ptr->get_device_view()}); tmp_minor_keys.resize(0, handle.get_stream()); tmp_key_aggregated_edge_weights.resize(0, handle.get_stream()); tmp_minor_keys.shrink_to_fit(handle.get_stream()); diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index ea8b753494f..a2e448e85a0 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -43,6 +43,38 @@ namespace cugraph { +namespace detail { + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct key_aggregated_edge_op_t { + weight_t total_edge_weight{}; + weight_t resolution{}; + __device__ auto operator()( + vertex_t src, + vertex_t neighbor_cluster, + weight_t new_cluster_sum, + thrust::tuple src_info, + weight_t a_new) const + { + auto k_k = thrust::get<0>(src_info); + auto src_cluster = thrust::get<1>(src_info); + auto a_old = thrust::get<2>(src_info); + auto old_cluster_sum = thrust::get<3>(src_info); + auto cluster_subtract = thrust::get<4>(src_info); + + if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; + + weight_t delta_modularity = 2 * (((new_cluster_sum - old_cluster_sum) / total_edge_weight) - + resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / + (total_edge_weight * total_edge_weight)); + + return thrust::make_tuple(neighbor_cluster, delta_modularity); + } +}; + +} // namespace detail + template class Louvain { public: @@ -440,22 +472,7 @@ class Louvain { cluster_keys_v_.begin(), cluster_keys_v_.end(), cluster_weights_v_.begin(), - [total_edge_weight, resolution] __device__( - auto src, auto neighbor_cluster, auto new_cluster_sum, auto src_info, auto a_new) { - auto k_k = thrust::get<0>(src_info); - auto src_cluster = thrust::get<1>(src_info); - auto a_old = thrust::get<2>(src_info); - auto old_cluster_sum = thrust::get<3>(src_info); - auto cluster_subtract = thrust::get<4>(src_info); - - if (src_cluster == neighbor_cluster) new_cluster_sum -= cluster_subtract; - - weight_t delta_modularity = 2 * (((new_cluster_sum - old_cluster_sum) / total_edge_weight) - - resolution * (a_new * k_k - a_old * k_k + k_k * k_k) / - (total_edge_weight * total_edge_weight)); - - return thrust::make_tuple(neighbor_cluster, delta_modularity); - }, + detail::key_aggregated_edge_op_t{total_edge_weight, resolution}, [] __device__(auto p1, auto p2) { auto id1 = thrust::get<0>(p1); auto id2 = thrust::get<0>(p2); From 0734f2a08cb45b8d839187d3cb0f7bb0876fd2e9 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 2 Sep 2021 16:08:32 -0400 Subject: [PATCH 20/57] cleanup multi-source BFS artifacts --- cpp/include/cugraph/algorithms.hpp | 3 +- cpp/src/traversal/bfs.cu | 50 ++++++++--------- cpp/tests/traversal/mg_bfs_test.cpp | 13 ++++-- 3 files changed, 25 insertions(+), 41 deletions(-) diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index 7d405d324fc..d719abaaa4b 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -1143,7 +1143,6 @@ weight_t hungarian(raft::handle_t const& handle, * @param predecessors Pointer to the output predecessor array or `nullptr`. * @param sources Source vertices to start breadth-first search (root vertex of the breadth-first * search tree). 
If more than one source is passed, there must be a single source per component. - * Device memory and host memory are accepted. * @param n_sources number of sources (one source per component at most). * @param direction_optimizing If set to true, this algorithm switches between the push based * breadth-first search and pull based breadth-first search depending on the size of the @@ -1158,7 +1157,7 @@ void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, vertex_t* distances, vertex_t* predecessors, - vertex_t* sources, + vertex_t* const sources, size_t n_sources = 1, bool direction_optimizing = false, vertex_t depth_limit = std::numeric_limits::max(), diff --git a/cpp/src/traversal/bfs.cu b/cpp/src/traversal/bfs.cu index 3cf97399b51..abe90d8cc7e 100644 --- a/cpp/src/traversal/bfs.cu +++ b/cpp/src/traversal/bfs.cu @@ -21,7 +21,6 @@ #include #include #include -#include #include #include @@ -47,7 +46,7 @@ void bfs(raft::handle_t const& handle, GraphViewType const& push_graph_view, typename GraphViewType::vertex_type* distances, PredecessorIterator predecessor_first, - typename GraphViewType::vertex_type* sources, + typename GraphViewType::vertex_type* const sources, size_t n_sources, bool direction_optimizing, typename GraphViewType::vertex_type depth_limit, @@ -62,7 +61,11 @@ void bfs(raft::handle_t const& handle, auto const num_vertices = push_graph_view.get_number_of_vertices(); if (num_vertices == 0) { return; } - // CUGRAPH_EXPECTS(sources != nullptr, "Invalid input argument: sources cannot be null"); + + // 1. check input arguments + + CUGRAPH_EXPECTS((n_sources == 0) || (sources != nullptr), + "Invalid input argument: sources cannot be null"); auto aggregate_n_sources = GraphViewType::is_multi_gpu @@ -71,31 +74,18 @@ void bfs(raft::handle_t const& handle, CUGRAPH_EXPECTS(aggregate_n_sources > 0, "Invalid input argument: input should have at least one source"); - // 1. 
check input arguments CUGRAPH_EXPECTS( push_graph_view.is_symmetric() || !direction_optimizing, "Invalid input argument: input graph should be symmetric for direction optimizing BFS."); - // Transfer single source to the device for single source case - vertex_t* d_sources = sources; - rmm::device_uvector d_sources_v(0, handle.get_stream()); - if (aggregate_n_sources == 1 && n_sources) { - cudaPointerAttributes s_att; - CUDA_CHECK(cudaPointerGetAttributes(&s_att, sources)); - if (s_att.devicePointer == nullptr) { - d_sources_v.resize(n_sources, handle.get_stream()); - d_sources = d_sources_v.data(); - raft::copy(d_sources, sources, n_sources, handle.get_stream()); - } - } - if (do_expensive_check) { - vertex_partition_device_t vertex_partition(push_graph_view); + auto vertex_partition = vertex_partition_device_view_t( + push_graph_view.get_vertex_partition_view()); auto num_invalid_vertices = count_if_v(handle, push_graph_view, - d_sources, - d_sources + n_sources, + sources, + sources + n_sources, [vertex_partition] __device__(auto val) { return !(vertex_partition.is_valid_vertex(val) && vertex_partition.is_local_vertex_nocheck(val)); @@ -122,8 +112,8 @@ void bfs(raft::handle_t const& handle, if (n_sources) { thrust::for_each( rmm::exec_policy(handle.get_thrust_policy()), - d_sources, - d_sources + n_sources, + sources, + sources + n_sources, [vertex_partition, distances, predecessor_first] __device__(auto v) { *(distances + vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v)) = vertex_t{0}; @@ -138,20 +128,8 @@ void bfs(raft::handle_t const& handle, static_cast(Bucket::num_buckets)> vertex_frontier(handle); - // insert local source(s) in the bucket - if (aggregate_n_sources == 1) { - vertex_t src; - // FIXME: this (cheap) transfer could be skiped when is_local_vertex_nocheck accpets device mem - raft::copy(&src, sources, n_sources, handle.get_stream()); - if (push_graph_view.is_local_vertex_nocheck(src)) { - vertex_frontier.get_bucket(static_cast(Bucket::cur)) - .insert(d_sources, d_sources + n_sources); - } - } else { - // pre-shuffled - vertex_frontier.get_bucket(static_cast(Bucket::cur)) - .insert(d_sources, d_sources + n_sources); - } + vertex_frontier.get_bucket(static_cast(Bucket::cur)).insert(sources, sources + n_sources); + // 4. BFS iteration vertex_t depth{0}; while (true) { diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp index e2b2100efad..97f95507205 100644 --- a/cpp/tests/traversal/mg_bfs_test.cpp +++ b/cpp/tests/traversal/mg_bfs_test.cpp @@ -114,12 +114,17 @@ class Tests_MGBFS : public ::testing::TestWithParam>(bfs_usecase.source, handle.get_stream()) + : std::nullopt; + cugraph::bfs(handle, mg_graph_view, d_mg_distances.data(), d_mg_predecessors.data(), - static_cast(bfs_usecase.source), - false, + d_mg_source ? (*d_mg_source).data() : static_cast(nullptr), + d_mg_source ? size_t{1} : size_t{0}, std::numeric_limits::max()); if (cugraph::test::g_perf) { @@ -191,11 +196,13 @@ class Tests_MGBFS : public ::testing::TestWithParam d_sg_source(unrenumbered_source, handle.get_stream()); cugraph::bfs(handle, sg_graph_view, d_sg_distances.data(), d_sg_predecessors.data(), - unrenumbered_source, + d_sg_source.data(), + size_t{1}, false, std::numeric_limits::max()); // 4-5. 
compare From e7e3db1dbd59bc705668c7dcab240f0730919f23 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 2 Sep 2021 16:13:56 -0400 Subject: [PATCH 21/57] add missing const --- cpp/src/traversal/bfs.cu | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/cpp/src/traversal/bfs.cu b/cpp/src/traversal/bfs.cu index abe90d8cc7e..fa562e38922 100644 --- a/cpp/src/traversal/bfs.cu +++ b/cpp/src/traversal/bfs.cu @@ -46,7 +46,7 @@ void bfs(raft::handle_t const& handle, GraphViewType const& push_graph_view, typename GraphViewType::vertex_type* distances, PredecessorIterator predecessor_first, - typename GraphViewType::vertex_type* const sources, + typename GraphViewType::vertex_type const* sources, size_t n_sources, bool direction_optimizing, typename GraphViewType::vertex_type depth_limit, @@ -195,7 +195,7 @@ void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, vertex_t* distances, vertex_t* predecessors, - vertex_t* sources, + vertex_t const* sources, size_t n_sources, bool direction_optimizing, vertex_t depth_limit, @@ -230,7 +230,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -240,7 +240,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -250,7 +250,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -260,7 +260,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -270,7 +270,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t* distances, int64_t* predecessors, - int64_t* sources, + int64_t const* sources, size_t n_sources, bool direction_optimizing, int64_t depth_limit, @@ -280,7 +280,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t* distances, int64_t* predecessors, - int64_t* sources, + int64_t const* sources, size_t n_sources, bool direction_optimizing, int64_t depth_limit, @@ -290,7 +290,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -300,7 +300,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -310,7 +310,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -320,7 +320,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int32_t* distances, int32_t* predecessors, - int32_t* sources, + 
int32_t const* sources, size_t n_sources, bool direction_optimizing, int32_t depth_limit, @@ -330,7 +330,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t* distances, int64_t* predecessors, - int64_t* sources, + int64_t const* sources, size_t n_sources, bool direction_optimizing, int64_t depth_limit, @@ -340,7 +340,7 @@ template void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, int64_t* distances, int64_t* predecessors, - int64_t* sources, + int64_t const* sources, size_t n_sources, bool direction_optimizing, int64_t depth_limit, From 559d270e985687e8f2a5628406568d7345863542 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 2 Sep 2021 16:33:38 -0400 Subject: [PATCH 22/57] additional fixes --- cpp/include/cugraph/algorithms.hpp | 2 +- cpp/tests/traversal/bfs_test.cpp | 7 ++++--- cpp/tests/traversal/mg_bfs_test.cpp | 4 ++-- cpp/tests/traversal/ms_bfs_test.cu | 10 ++++++---- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cpp/include/cugraph/algorithms.hpp b/cpp/include/cugraph/algorithms.hpp index d719abaaa4b..2c2f64217f8 100644 --- a/cpp/include/cugraph/algorithms.hpp +++ b/cpp/include/cugraph/algorithms.hpp @@ -1157,7 +1157,7 @@ void bfs(raft::handle_t const& handle, graph_view_t const& graph_view, vertex_t* distances, vertex_t* predecessors, - vertex_t* const sources, + vertex_t const* sources, size_t n_sources = 1, bool direction_optimizing = false, vertex_t depth_limit = std::numeric_limits::max(), diff --git a/cpp/tests/traversal/bfs_test.cpp b/cpp/tests/traversal/bfs_test.cpp index 04b41db9b9b..3f1e18d8cbf 100644 --- a/cpp/tests/traversal/bfs_test.cpp +++ b/cpp/tests/traversal/bfs_test.cpp @@ -27,6 +27,7 @@ #include #include +#include #include #include @@ -133,14 +134,14 @@ class Tests_BFS : public ::testing::TestWithParam const d_source(bfs_usecase.source, handle.get_stream()); cugraph::bfs(handle, graph_view, d_distances.data(), d_predecessors.data(), - &source, - 1, + d_source.data(), + size_t{1}, false, std::numeric_limits::max()); diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp index 97f95507205..2a9d33fa6d5 100644 --- a/cpp/tests/traversal/mg_bfs_test.cpp +++ b/cpp/tests/traversal/mg_bfs_test.cpp @@ -114,7 +114,7 @@ class Tests_MGBFS : public ::testing::TestWithParam>(bfs_usecase.source, handle.get_stream()) : std::nullopt; @@ -196,7 +196,7 @@ class Tests_MGBFS : public ::testing::TestWithParam d_sg_source(unrenumbered_source, handle.get_stream()); + rmm::device_scalar const d_sg_source(unrenumbered_source, handle.get_stream()); cugraph::bfs(handle, sg_graph_view, d_sg_distances.data(), diff --git a/cpp/tests/traversal/ms_bfs_test.cu b/cpp/tests/traversal/ms_bfs_test.cu index 98a9d613c08..b1833c3f295 100644 --- a/cpp/tests/traversal/ms_bfs_test.cu +++ b/cpp/tests/traversal/ms_bfs_test.cu @@ -180,12 +180,13 @@ class Tests_MsBfs : public ::testing::TestWithParam { bool direction_optimizing = false; vertex_t source = h_sources[0]; + rmm::device_scalar const d_source_0(source, handle.get_stream()); cugraph::bfs(handle, graph_view, d_distances_ref[0].begin(), d_predecessors_ref[0].begin(), - &source, - 1, + d_source_0.data(), + size_t{1}, direction_optimizing, configuration.radius); @@ -195,12 +196,13 @@ class Tests_MsBfs : public ::testing::TestWithParam { cudaProfilerStart(); for (size_t i = 0; i < h_sources.size(); i++) { source = h_sources[i]; + rmm::device_scalar const d_source_i(source, handle.get_stream()); cugraph::bfs(handle, graph_view, 
d_distances_ref[i].begin(), d_predecessors_ref[i].begin(), - &source, - 1, + d_source_i.data(), + size_t{1}, direction_optimizing, configuration.radius); } From 3248c7e89cf7ae3bb6536d81ad38c4e67742e46e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 2 Sep 2021 16:45:14 -0400 Subject: [PATCH 23/57] more fixes --- cpp/tests/traversal/mg_bfs_test.cpp | 1 + cpp/tests/traversal/ms_bfs_test.cu | 16 ++++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp index 2a9d33fa6d5..1a938edeee9 100644 --- a/cpp/tests/traversal/mg_bfs_test.cpp +++ b/cpp/tests/traversal/mg_bfs_test.cpp @@ -125,6 +125,7 @@ class Tests_MGBFS : public ::testing::TestWithParam(nullptr), d_mg_source ? size_t{1} : size_t{0}, + false, std::numeric_limits::max()); if (cugraph::test::g_perf) { diff --git a/cpp/tests/traversal/ms_bfs_test.cu b/cpp/tests/traversal/ms_bfs_test.cu index b1833c3f295..94776689929 100644 --- a/cpp/tests/traversal/ms_bfs_test.cu +++ b/cpp/tests/traversal/ms_bfs_test.cu @@ -23,19 +23,23 @@ #include #include -#include -#include #include +#include +#include +#include +#include +#include + +#include #include #include #include + +#include + #include #include #include -#include -#include -#include -#include #include #include #include From ee07b2bb537c3261caf179b80dd3d08fb4b49973 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 3 Sep 2021 23:31:16 -0400 Subject: [PATCH 24/57] additional bug fixes --- ...ransform_reduce_key_aggregated_out_nbr.cuh | 116 ++++++++++++------ cpp/src/community/louvain.cuh | 64 +++++++--- 2 files changed, 122 insertions(+), 58 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index af25fae7234..7d68cf8d13d 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -79,9 +79,12 @@ struct call_key_aggregated_e_op_t { __device__ auto operator()( thrust::tuple val /* major, minor key, weight */) const { - auto major = thrust::get<0>(val); - auto key = thrust::get<1>(val); - auto w = thrust::get<2>(val); + auto major = thrust::get<0>(val); + auto key = thrust::get<1>(val); + auto w = thrust::get<2>(val); + auto row_value = matrix_partition_row_value_input.get( + matrix_partition.get_major_offset_from_major_nocheck(major)); + auto key_val = kv_map.find(key)->second.load(cuda::std::memory_order_relaxed); return key_aggregated_e_op(major, key, w, @@ -91,6 +94,52 @@ struct call_key_aggregated_e_op_t { } }; +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct is_first_in_run_t { + vertex_t const* major_vertices{nullptr}; + __device__ bool operator()(size_t i) const + { + return ((i == 0) || (major_vertices[i] != major_vertices[i - 1])) ? true : false; + } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct is_valid_vertex_t { + __device__ bool operator()(vertex_t v) const { return v != invalid_vertex_id::value; } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct invalidate_if_not_first_in_run_t { + vertex_t const* major_vertices{nullptr}; + __device__ vertex_t operator()(size_t i) const + { + return ((i == 0) || (major_vertices[i] != major_vertices[i - 1])) + ? 
major_vertices[i] + : invalid_vertex_id::value; + } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct vertex_local_offset_t { + vertex_partition_device_view_t vertex_partition{}; + __device__ vertex_t operator()(vertex_t v) const + { + return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); + } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct reduce_with_init_t { + ReduceOp reduce_op{}; + T init{}; + __device__ T operator()(T val) const { return reduce_op(val, init); } +}; + template __global__ void for_all_major_for_all_nbr_mid_degree( matrix_partition_device_view_t matrix_partition, @@ -523,9 +572,9 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( tmp_major_vertices.resize(reduced_size, handle.get_stream()); tmp_minor_keys.resize(tmp_major_vertices.size(), handle.get_stream()); tmp_key_aggregated_edge_weights.resize(tmp_major_vertices.size(), handle.get_stream()); - tmp_major_vertices.shrink_to_fit(handle.get_stream()); tmp_minor_keys.shrink_to_fit(handle.get_stream()); tmp_key_aggregated_edge_weights.shrink_to_fit(handle.get_stream()); + tmp_major_vertices.shrink_to_fit(handle.get_stream()); } if (GraphViewType::is_multi_gpu) { @@ -682,50 +731,39 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( major_vertices.end(), get_dataframe_buffer_begin(e_op_result_buffer)); - auto num_uniques = thrust::count_if( - execution_policy, - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(major_vertices.size()), - [major_vertices = major_vertices.data()] __device__(auto i) { - return ((i == 0) || (major_vertices[i] != major_vertices[i - 1])) ? true : false; - }); + auto num_uniques = thrust::count_if(execution_policy, + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(major_vertices.size()), + detail::is_first_in_run_t{major_vertices.data()}); rmm::device_uvector unique_major_vertices(num_uniques, handle.get_stream()); auto major_vertex_first = thrust::make_transform_iterator( thrust::make_counting_iterator(size_t{0}), - [major_vertices = major_vertices.data()] __device__(auto i) { - return ((i == 0) || (major_vertices[i] != major_vertices[i - 1])) - ? 
major_vertices[i] - : invalid_vertex_id::value; - }); - thrust::copy_if( - execution_policy, - major_vertex_first, - major_vertex_first + major_vertices.size(), - unique_major_vertices.begin(), - [] __device__(auto major) { return major != invalid_vertex_id::value; }); - thrust::reduce_by_key( - execution_policy, - major_vertices.begin(), - major_vertices.end(), - get_dataframe_buffer_begin(e_op_result_buffer), - thrust::make_discard_iterator(), - thrust::make_permutation_iterator( - vertex_value_output_first, - thrust::make_transform_iterator( - unique_major_vertices.begin(), - [vertex_partition = vertex_partition_device_view_t( - graph_view.get_vertex_partition_view())] __device__(auto v) { - return vertex_partition.get_local_vertex_offset_from_vertex_nocheck(v); - })), - thrust::equal_to{}, - reduce_op); + detail::invalidate_if_not_first_in_run_t{major_vertices.data()}); + thrust::copy_if(execution_policy, + major_vertex_first, + major_vertex_first + major_vertices.size(), + unique_major_vertices.begin(), + detail::is_valid_vertex_t{}); + thrust::reduce_by_key(execution_policy, + major_vertices.begin(), + major_vertices.end(), + get_dataframe_buffer_begin(e_op_result_buffer), + thrust::make_discard_iterator(), + thrust::make_permutation_iterator( + vertex_value_output_first, + thrust::make_transform_iterator( + unique_major_vertices.begin(), + detail::vertex_local_offset_t{ + graph_view.get_vertex_partition_view()})), + thrust::equal_to{}, + reduce_op); thrust::transform(execution_policy, vertex_value_output_first, vertex_value_output_first + graph_view.get_number_of_local_vertices(), vertex_value_output_first, - [reduce_op, init] __device__(auto val) { return reduce_op(val, init); }); + detail::reduce_with_init_t{reduce_op, init}); } } // namespace cugraph diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index a2e448e85a0..29153fc2d37 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -73,6 +73,46 @@ struct key_aggregated_edge_op_t { } }; +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct reduce_op_t { + __device__ auto operator()(thrust::tuple p0, + thrust::tuple p1) const + { + auto id0 = thrust::get<0>(p0); + auto id1 = thrust::get<0>(p1); + auto wt0 = thrust::get<1>(p0); + auto wt1 = thrust::get<1>(p1); + + return (wt0 < wt1) ? p1 : ((wt0 > wt1) ? p0 : ((id0 < id1) ? p0 : p1)); + } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct cluster_update_op_t { + bool up_down{}; + __device__ auto operator()(vertex_t old_cluster, thrust::tuple p) const + { + vertex_t new_cluster = thrust::get<0>(p); + weight_t delta_modularity = thrust::get<1>(p); + + return (delta_modularity > weight_t{0}) + ? (((new_cluster > old_cluster) != up_down) ? old_cluster : new_cluster) + : old_cluster; + } +}; + +// a workaround for cudaErrorInvalidDeviceFunction error when device lambda is used +template +struct return_edge_weight_t { + __device__ auto operator()( + vertex_t, vertex_t, weight_t w, thrust::nullopt_t, thrust::nullopt_t) const + { + return w; + } +}; + } // namespace detail template @@ -473,14 +513,7 @@ class Louvain { cluster_keys_v_.end(), cluster_weights_v_.begin(), detail::key_aggregated_edge_op_t{total_edge_weight, resolution}, - [] __device__(auto p1, auto p2) { - auto id1 = thrust::get<0>(p1); - auto id2 = thrust::get<0>(p2); - auto wt1 = thrust::get<1>(p1); - auto wt2 = thrust::get<1>(p2); - - return (wt1 < wt2) ? 
p2 : ((wt1 > wt2) ? p1 : ((id1 < id2) ? p1 : p2)); - }, + detail::reduce_op_t{}, thrust::make_tuple(vertex_t{-1}, weight_t{0}), cugraph::get_dataframe_buffer_begin>(output_buffer)); @@ -490,20 +523,13 @@ class Louvain { next_clusters_v_.end(), cugraph::get_dataframe_buffer_begin>(output_buffer), next_clusters_v_.begin(), - [up_down] __device__(vertex_t old_cluster, auto p) { - vertex_t new_cluster = thrust::get<0>(p); - weight_t delta_modularity = thrust::get<1>(p); - - return (delta_modularity > weight_t{0}) - ? (((new_cluster > old_cluster) != up_down) ? old_cluster : new_cluster) - : old_cluster; - }); + detail::cluster_update_op_t{up_down}); if constexpr (graph_view_t::is_multi_gpu) { copy_to_adj_matrix_row( handle_, current_graph_view_, next_clusters_v_.begin(), src_clusters_cache_); - copy_to_adj_matrix_row( - handle_, current_graph_view_, next_clusters_v_.begin(), src_clusters_cache_); + copy_to_adj_matrix_col( + handle_, current_graph_view_, next_clusters_v_.begin(), dst_clusters_cache_); } std::tie(cluster_keys_v_, cluster_weights_v_) = @@ -516,7 +542,7 @@ class Louvain { ? src_clusters_cache_.device_view() : detail::major_properties_device_view_t( next_clusters_v_.data()), - [] __device__(auto, auto, auto wt, auto, auto) { return wt; }, + detail::return_edge_weight_t{}, weight_t{0}); } From 44862f745629b902f1c41e7032aed8dd3b7e8e2a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 8 Sep 2021 14:02:06 -0400 Subject: [PATCH 25/57] add optional variables storing local unique edge rows/columns to graph_t & graph_view_t --- cpp/include/cugraph/graph.hpp | 5 +++++ cpp/include/cugraph/graph_view.hpp | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index de33469e792..69ab403bbdb 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -131,6 +131,11 @@ class graph_t 0 std::optional> adj_matrix_partition_segment_offsets_{std::nullopt}; + + // if valid, store row/column properties in key/value pairs (this saves memory if # unique edge + // rows/cols << V / row_comm_size|col_comm_size). + std::optional> local_sorted_unique_edge_rows{std::nullopt}; + std::optional> local_sorted_unique_edge_cols{std::nullopt}; }; // single-GPU version diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 81aa00fd2ea..f0040b9acb3 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -590,7 +590,8 @@ class graph_view_t> adj_matrix_partition_segment_offsets_{}; - // FIXME: to be implemented. + // if valid, store row/column properties in key/value pairs (this saves memory if # unique edge + // rows/cols << V / row_comm_size|col_comm_size). 
std::optional local_sorted_unique_edge_row_first_{std::nullopt}; std::optional local_sorted_unique_edge_row_last_{std::nullopt}; std::optional local_sorted_unique_edge_col_first_{std::nullopt}; From a04844223131f58b8e63d25d1d6c19f5f992a617 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 8 Sep 2021 14:14:49 -0400 Subject: [PATCH 26/57] update renumber_edgelist (in MG) to return # local unique edge rows/cols in addition (this information will be used in future memory scaling optimization) --- cpp/include/cugraph/graph_functions.hpp | 13 +-- cpp/src/structure/renumber_edgelist.cu | 115 +++++++++++++++--------- 2 files changed, 79 insertions(+), 49 deletions(-) diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 61bf30e86a2..7e0c41603cd 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -69,10 +69,11 @@ namespace cugraph { * for further memory footprint optimization if provided. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). * @return std::tuple, partition_t, vertex_t, edge_t, - * std::vector> Tuple of labels (vertex IDs before renumbering) for the entire set of - * vertices (assigned to this process in multi-GPU), partition_t object storing graph partitioning - * information, total number of vertices, total number of edges, and vertex partition segment - * offsets (a vertex partition is partitioned to multiple segments based on vertex degrees). + * std::vector, vertex_t, vertex_t> Tuple of labels (vertex IDs before renumbering) for + * the entire set of vertices (assigned to this process in multi-GPU), partition_t object storing + * graph partitioning information, total number of vertices, total number of edges, vertex partition + * segment offsets (a vertex partition is partitioned to multiple segments based on vertex degrees), + * and the number of unique edge rows and columns. 
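 *
 * A calling sketch (illustrative, multi-GPU path; argument setup omitted):
 *   auto [renumber_map_labels, partition, number_of_vertices, number_of_edges,
 *         segment_offsets, num_unique_edge_rows, num_unique_edge_cols] =
 *     renumber_edgelist<vertex_t, edge_t, true>(handle, local_vertex_span, major_ptrs,
 *                                               minor_ptrs, edgelist_edge_counts,
 *                                               std::nullopt, do_expensive_check);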
*/ template std::enable_if_t, vertex_t, edge_t, - std::vector>> + std::vector, + vertex_t, + vertex_t>> renumber_edgelist( raft::handle_t const& handle, std::optional> local_vertex_span, diff --git a/cpp/src/structure/renumber_edgelist.cu b/cpp/src/structure/renumber_edgelist.cu index ecdb06e399a..436ad674443 100644 --- a/cpp/src/structure/renumber_edgelist.cu +++ b/cpp/src/structure/renumber_edgelist.cu @@ -44,13 +44,14 @@ namespace cugraph { namespace detail { +// returns renumber map, segment_offsets, and # unique edge majors & minors template -std::tuple, std::vector> compute_renumber_map( - raft::handle_t const& handle, - std::optional> vertex_span, - std::vector const& edgelist_major_vertices, - std::vector const& edgelist_minor_vertices, - std::vector const& edgelist_edge_counts) +std::tuple, std::vector, vertx_t, vertex_t> +compute_renumber_map(raft::handle_t const& handle, + std::optional> vertex_span, + std::vector const& edgelist_major_vertices, + std::vector const& edgelist_minor_vertices, + std::vector const& edgelist_edge_counts) { // FIXME: compare this sort based approach with hash based approach in both speed and memory // footprint @@ -75,6 +76,7 @@ std::tuple, std::vector> compute_renumbe rmm::device_uvector major_labels(0, handle.get_stream()); rmm::device_uvector major_counts(0, handle.get_stream()); + vertex_t num_unique_edge_majors{0}; for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { rmm::device_uvector tmp_major_labels(0, handle.get_stream()); rmm::device_uvector tmp_major_counts(0, handle.get_stream()); @@ -104,6 +106,7 @@ std::tuple, std::vector> compute_renumbe tmp_major_labels.begin(), tmp_major_counts.begin()); } + num_unique_edge_majors += static_cast(tmp_major_labels.size()); if (multi_gpu) { auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); @@ -170,6 +173,7 @@ std::tuple, std::vector> compute_renumbe edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), handle.get_stream()); + vertex_t num_unique_edge_minors{0}; for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { thrust::copy(handle.get_thrust_policy(), edgelist_minor_vertices[i], @@ -182,6 +186,7 @@ std::tuple, std::vector> compute_renumbe minor_labels.begin(), thrust::unique(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end())), handle.get_stream()); + num_unique_edge_minors += static_cast(minor_labels.size()); if (multi_gpu) { auto& comm = handle.get_comms(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -364,7 +369,8 @@ std::tuple, std::vector> compute_renumbe handle.get_stream()); handle.get_stream_view().synchronize(); - return std::make_tuple(std::move(labels), h_segment_offsets); + return std::make_tuple( + std::move(labels), h_segment_offsets, num_unique_edge_majors, num_unique_edge_minors); } template @@ -609,7 +615,9 @@ std::enable_if_t, vertex_t, edge_t, - std::vector>> + std::vector, + vertex_t, + vertex_t>> renumber_edgelist( raft::handle_t const& handle, std::optional> local_vertex_span, @@ -648,7 +656,10 @@ renumber_edgelist( // 1. 
compute renumber map - auto [renumber_map_labels, vertex_partition_segment_offsets] = + auto [renumber_map_labels, + vertex_partition_segment_offsets, + num_unique_edge_majors, + num_unique_edge_minors] = detail::compute_renumber_map(handle, local_vertex_span, edgelist_const_major_vertices, @@ -832,7 +843,9 @@ renumber_edgelist( partition, number_of_vertices, number_of_edges, - vertex_partition_segment_offsets); + vertex_partition_segment_offsets, + num_unique_edge_majors, + num_unique_edge_minors); } template @@ -854,7 +867,9 @@ renumber_edgelist(raft::handle_t const& handle, std::nullopt); } - auto [renumber_map_labels, segment_offsets] = + rmm::device_uvector renumber_map_labels(0, handle.get_stream()); + std::vector segment_offsets{}; + std::tie(renumber_map_labels, segment_offsets, std::ignore, std::ignore) = detail::compute_renumber_map( handle, vertex_span, @@ -893,17 +908,21 @@ renumber_edgelist(raft::handle_t const& handle, // instantiations for // -template std:: - tuple, partition_t, int32_t, int32_t, std::vector> - renumber_edgelist( - raft::handle_t const& handle, - std::optional> local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& edgelist_edge_counts, - std::optional>> const& - edgelist_intra_partition_segment_offsets, - bool do_expensive_check); +template std::tuple, + partition_t, + int32_t, + int32_t, + std::vector, + int32_t, + int32_t> +renumber_edgelist( + raft::handle_t const& handle, + std::optional> local_vertex_span, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + std::optional>> const& edgelist_intra_partition_segment_offsets, + bool do_expensive_check); template std::tuple, std::vector> renumber_edgelist( @@ -916,17 +935,21 @@ renumber_edgelist( // instantiations for // -template std:: - tuple, partition_t, int32_t, int64_t, std::vector> - renumber_edgelist( - raft::handle_t const& handle, - std::optional> local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& edgelist_edge_counts, - std::optional>> const& - edgelist_intra_partition_segment_offsets, - bool do_expensive_check); +template std::tuple, + partition_t, + int32_t, + int64_t, + std::vector, + int32_t, + int32_t> +renumber_edgelist( + raft::handle_t const& handle, + std::optional> local_vertex_span, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + std::optional>> const& edgelist_intra_partition_segment_offsets, + bool do_expensive_check); template std::tuple, std::vector> renumber_edgelist( @@ -939,17 +962,21 @@ renumber_edgelist( // instantiations for // -template std:: - tuple, partition_t, int64_t, int64_t, std::vector> - renumber_edgelist( - raft::handle_t const& handle, - std::optional> local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& edgelist_edge_counts, - std::optional>> const& - edgelist_intra_partition_segment_offsets, - bool do_expensive_check); +template std::tuple, + partition_t, + int64_t, + int64_t, + std::vector, + int64_t, + int64_t> +renumber_edgelist( + raft::handle_t const& handle, + std::optional> local_vertex_span, + std::vector const& 
edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + std::optional>> const& edgelist_intra_partition_segment_offsets, + bool do_expensive_check); template std::tuple, std::vector> renumber_edgelist( From ed46464de5906491e54a6c4411aa1de2cb11725d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 8 Sep 2021 14:25:39 -0400 Subject: [PATCH 27/57] fix an erroneous comment --- cpp/include/cugraph/graph_functions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 7e0c41603cd..00331f15906 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -73,7 +73,7 @@ namespace cugraph { * the entire set of vertices (assigned to this process in multi-GPU), partition_t object storing * graph partitioning information, total number of vertices, total number of edges, vertex partition * segment offsets (a vertex partition is partitioned to multiple segments based on vertex degrees), - * and the number of unique edge rows and columns. + * and the number of unique edge major & minor vertex IDs. */ template std::enable_if_t Date: Wed, 8 Sep 2021 14:49:35 -0400 Subject: [PATCH 28/57] update the renumber_edgelist caller --- cpp/include/cugraph/utilities/cython.hpp | 5 +++++ cpp/src/structure/coarsen_graph_impl.cuh | 8 +++++++- cpp/src/structure/create_graph_from_edgelist_impl.hpp | 7 +++---- cpp/src/structure/renumber_edgelist_impl.cuh | 2 +- cpp/src/utilities/cython.cu | 4 +++- 5 files changed, 19 insertions(+), 7 deletions(-) diff --git a/cpp/include/cugraph/utilities/cython.hpp b/cpp/include/cugraph/utilities/cython.hpp index 3a4f437bfd0..c7bbe68b2c2 100644 --- a/cpp/include/cugraph/utilities/cython.hpp +++ b/cpp/include/cugraph/utilities/cython.hpp @@ -270,6 +270,9 @@ struct renum_tuple_t { return std::make_unique>(segment_offsets_); } + vertex_t& get_num_unique_edge_majors(void) { return num_unique_edge_majors_; } + vertex_t& get_num_unique_edge_minors(void) { return num_unique_edge_minors_; } + // `partition_t` pass-through getters // int get_part_row_size() const { return part_.get_row_size(); } @@ -364,6 +367,8 @@ struct renum_tuple_t { vertex_t nv_{0}; edge_t ne_{0}; std::vector segment_offsets_; + vertex_t num_unique_edge_majors_{0}; + vertex_t num_unique_edge_minors_{0}; }; // FIXME: finish description for vertex_partition_offsets diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 345ce989e53..be424fcf0e3 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -437,7 +437,13 @@ coarsen_graph( minor_ptrs[i] = coarsened_edgelist_minor_vertices[i].data(); counts[i] = static_cast(coarsened_edgelist_major_vertices[i].size()); } - std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges, segment_offsets) = + std::tie(renumber_map_labels, + partition, + number_of_vertices, + number_of_edges, + segment_offsets, + std::ignore, + std::ignore) = renumber_edgelist( handle, std::optional>{ diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.hpp b/cpp/src/structure/create_graph_from_edgelist_impl.hpp index 58991f2477c..d2353792a1b 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.hpp +++ b/cpp/src/structure/create_graph_from_edgelist_impl.hpp @@ -23,9 +23,6 @@ #include -#include -#include - #include #include @@ -112,7 +109,9 @@ 
create_graph_from_edgelist_impl(raft::handle_t const& handle, partition, number_of_vertices, number_of_edges, - *vertex_partition_segment_offsets) = + *vertex_partition_segment_offsets, + std::ignore, + std::ignore) = cugraph::renumber_edgelist( handle, local_vertex_span diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index e5d0bf0ba30..a990e5b378e 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -47,7 +47,7 @@ namespace detail { // returns renumber map, segment_offsets, and # unique edge majors & minors template -std::tuple, std::vector, vertx_t, vertex_t> +std::tuple, std::vector, vertex_t, vertex_t> compute_renumber_map(raft::handle_t const& handle, std::optional> vertex_span, std::vector const& edgelist_major_vertices, diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 25d42ec1f22..6f295871446 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -1216,7 +1216,9 @@ std::unique_ptr> call_renumber( p_ret->get_partition(), p_ret->get_num_vertices(), p_ret->get_num_edges(), - p_ret->get_segment_offsets()) = + p_ret->get_segment_offsets(), + p_ret->get_num_unique_edge_majors(), + p_ret->get_num_unique_edge_minors()) = cugraph::renumber_edgelist(handle, std::nullopt, major_ptrs, From 8b07a3c8a506ebba2930aa2afa021a7ab47e0015 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 8 Sep 2021 15:49:38 -0400 Subject: [PATCH 29/57] update MG graph_t callers --- cpp/include/cugraph/graph.hpp | 2 + cpp/include/cugraph/graph_functions.hpp | 2 +- cpp/include/cugraph/utilities/cython.hpp | 4 ++ cpp/src/structure/coarsen_graph_impl.cuh | 10 +++-- .../create_graph_from_edgelist_impl.hpp | 10 +++-- cpp/src/structure/graph_impl.cuh | 2 + cpp/src/structure/renumber_edgelist_impl.cuh | 14 ++++--- cpp/src/utilities/cython.cu | 38 +++++++++++-------- 8 files changed, 53 insertions(+), 29 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 69ab403bbdb..75ae57a2f88 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -70,6 +70,8 @@ class graph_t> const& segment_offsets, + vertex_t num_local_unique_edge_rows, + vertex_t num_local_unique_edge_cols, bool do_expensive_check = false); bool is_weighted() const { return adj_matrix_partition_weights_.has_value(); } diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 00331f15906..478df87d4c5 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -73,7 +73,7 @@ namespace cugraph { * the entire set of vertices (assigned to this process in multi-GPU), partition_t object storing * graph partitioning information, total number of vertices, total number of edges, vertex partition * segment offsets (a vertex partition is partitioned to multiple segments based on vertex degrees), - * and the number of unique edge major & minor vertex IDs. + * and the number of local unique edge major & minor vertex IDs. 
*/ template std::enable_if_t> segment_offsets{}; + vertex_t num_local_unique_edge_majors{}; + vertex_t num_local_unique_edge_minors{}; { std::vector major_ptrs(coarsened_edgelist_major_vertices.size()); std::vector minor_ptrs(major_ptrs.size()); @@ -442,8 +444,8 @@ coarsen_graph( number_of_vertices, number_of_edges, segment_offsets, - std::ignore, - std::ignore) = + num_local_unique_edge_majors, + num_local_unique_edge_minors) = renumber_edgelist( handle, std::optional>{ @@ -479,7 +481,9 @@ coarsen_graph( number_of_vertices, number_of_edges, graph_properties_t{graph_view.is_symmetric(), false}, - segment_offsets), + segment_offsets, + store_transposed ? num_local_unique_edge_minors : num_local_unique_edge_majors, + store_transposed ? num_local_unique_edge_majors : num_local_unique_edge_minors), std::move(renumber_map_labels)); } diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.hpp b/cpp/src/structure/create_graph_from_edgelist_impl.hpp index d2353792a1b..b7d21e9ac94 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.hpp +++ b/cpp/src/structure/create_graph_from_edgelist_impl.hpp @@ -95,6 +95,8 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, cugraph::partition_t partition{}; vertex_t number_of_vertices{}; edge_t number_of_edges{}; + vertex_t num_local_unique_edge_majors{}; + vertex_t num_local_unique_edge_minors{}; auto vertex_partition_segment_offsets = std::make_optional>(0); { std::vector major_ptrs(col_comm_size); @@ -110,8 +112,8 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, number_of_vertices, number_of_edges, *vertex_partition_segment_offsets, - std::ignore, - std::ignore) = + num_local_unique_edge_majors, + num_local_unique_edge_minors) = cugraph::renumber_edgelist( handle, local_vertex_span @@ -145,7 +147,9 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, number_of_vertices, number_of_edges, graph_properties, - vertex_partition_segment_offsets), + vertex_partition_segment_offsets, + store_transposed ? num_local_unique_edge_minors : num_local_unique_edge_majors, + store_transposed ? 
num_local_unique_edge_majors : num_local_unique_edge_minors), std::optional>{std::move(renumber_map_labels)}); } diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index b226427d613..48f94712808 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -203,6 +203,8 @@ graph_t> const& segment_offsets, + vertex_t num_local_unique_edge_rows, + vertex_t num_local_unique_edge_cols, bool do_expensive_check) : detail::graph_base_t( handle, number_of_vertices, number_of_edges, properties), diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index a990e5b378e..c9749b3202d 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -77,7 +77,7 @@ compute_renumber_map(raft::handle_t const& handle, rmm::device_uvector major_labels(0, handle.get_stream()); rmm::device_uvector major_counts(0, handle.get_stream()); - vertex_t num_unique_edge_majors{0}; + vertex_t num_local_unique_edge_majors{0}; for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { rmm::device_uvector tmp_major_labels(0, handle.get_stream()); rmm::device_uvector tmp_major_counts(0, handle.get_stream()); @@ -107,7 +107,7 @@ compute_renumber_map(raft::handle_t const& handle, tmp_major_labels.begin(), tmp_major_counts.begin()); } - num_unique_edge_majors += static_cast(tmp_major_labels.size()); + num_local_unique_edge_majors += static_cast(tmp_major_labels.size()); if (multi_gpu) { auto& col_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().col_name()); @@ -174,7 +174,7 @@ compute_renumber_map(raft::handle_t const& handle, edgelist_edge_counts.begin(), edgelist_edge_counts.end() - 1, minor_displs.begin() + 1); rmm::device_uvector minor_labels(minor_displs.back() + edgelist_edge_counts.back(), handle.get_stream()); - vertex_t num_unique_edge_minors{0}; + vertex_t num_local_unique_edge_minors{0}; for (size_t i = 0; i < edgelist_minor_vertices.size(); ++i) { thrust::copy(handle.get_thrust_policy(), edgelist_minor_vertices[i], @@ -187,7 +187,7 @@ compute_renumber_map(raft::handle_t const& handle, minor_labels.begin(), thrust::unique(handle.get_thrust_policy(), minor_labels.begin(), minor_labels.end())), handle.get_stream()); - num_unique_edge_minors += static_cast(minor_labels.size()); + num_local_unique_edge_minors += static_cast(minor_labels.size()); if (multi_gpu) { auto& comm = handle.get_comms(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -370,8 +370,10 @@ compute_renumber_map(raft::handle_t const& handle, handle.get_stream()); handle.get_stream_view().synchronize(); - return std::make_tuple( - std::move(labels), h_segment_offsets, num_unique_edge_majors, num_unique_edge_minors); + return std::make_tuple(std::move(labels), + h_segment_offsets, + num_local_unique_edge_majors, + num_local_unique_edge_minors); } template diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 6f295871446..f95f9e38ea8 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -169,6 +169,8 @@ std::unique_ptr> crea static_cast(graph_container.segment_offsets) + graph_container.num_segments + 1) : std::nullopt, + graph_container.num_local_unique_edge_rows, + graph_container.num_local_unique_edge_cols, graph_container.do_expensive_check); } @@ -221,6 +223,8 @@ void populate_graph_container(graph_container_t& graph_container, size_t num_local_edges, size_t num_global_vertices, size_t 
num_global_edges, + size_t num_local_unique_edge_rows, + size_t num_local_unique_edge_cols, bool is_weighted, bool is_symmetric, bool transposed, @@ -244,22 +248,24 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.col_comm_rank = col_comm_rank; } - graph_container.src_vertices = src_vertices; - graph_container.dst_vertices = dst_vertices; - graph_container.weights = weights; - graph_container.is_weighted = is_weighted; - graph_container.vertex_partition_offsets = vertex_partition_offsets; - graph_container.segment_offsets = segment_offsets; - graph_container.num_segments = num_segments; - graph_container.num_local_edges = num_local_edges; - graph_container.num_global_vertices = num_global_vertices; - graph_container.num_global_edges = num_global_edges; - graph_container.vertexType = vertexType; - graph_container.edgeType = edgeType; - graph_container.weightType = weightType; - graph_container.transposed = transposed; - graph_container.is_multi_gpu = multi_gpu; - graph_container.do_expensive_check = do_expensive_check; + graph_container.src_vertices = src_vertices; + graph_container.dst_vertices = dst_vertices; + graph_container.weights = weights; + graph_container.is_weighted = is_weighted; + graph_container.vertex_partition_offsets = vertex_partition_offsets; + graph_container.segment_offsets = segment_offsets; + graph_container.num_segments = num_segments; + graph_container.num_local_edges = num_local_edges; + graph_container.num_global_vertices = num_global_vertices; + graph_container.num_global_edges = num_global_edges; + graph_container.num_local_unique_edge_rows = num_local_unique_edge_rows; + graph_container.num_local_unique_edge_cols = num_local_unique_edge_cols; + graph_container.vertexType = vertexType; + graph_container.edgeType = edgeType; + graph_container.weightType = weightType; + graph_container.transposed = transposed; + graph_container.is_multi_gpu = multi_gpu; + graph_container.do_expensive_check = do_expensive_check; graph_properties_t graph_props{.is_symmetric = is_symmetric, .is_multigraph = false}; graph_container.graph_props = graph_props; From f11be0897c6e57934afb16f45789c68e26447e9e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 8 Sep 2021 16:52:44 -0400 Subject: [PATCH 30/57] remove std::optional from the input parameter segment_offsets of MG graph_t (as renumbering is mandatory in MG, so segment_offsets is not optional for MG) --- cpp/include/cugraph/graph.hpp | 2 +- cpp/src/structure/coarsen_graph_impl.cuh | 2 +- .../create_graph_from_edgelist_impl.hpp | 4 ++-- cpp/src/structure/graph_impl.cuh | 22 ++++++++----------- cpp/src/utilities/cython.cu | 9 +++----- 5 files changed, 16 insertions(+), 23 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 75ae57a2f88..09f29e714cb 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -69,7 +69,7 @@ class graph_t> const& segment_offsets, + std::vector const& segment_offsets, vertex_t num_local_unique_edge_rows, vertex_t num_local_unique_edge_cols, bool do_expensive_check = false); diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 70d96102f35..2bd17098f93 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -427,7 +427,7 @@ coarsen_graph( col_comm_rank); vertex_t number_of_vertices{}; edge_t number_of_edges{}; - std::optional> segment_offsets{}; + std::vector segment_offsets{}; vertex_t 
num_local_unique_edge_majors{}; vertex_t num_local_unique_edge_minors{}; { diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.hpp b/cpp/src/structure/create_graph_from_edgelist_impl.hpp index b7d21e9ac94..340dfebeda1 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.hpp +++ b/cpp/src/structure/create_graph_from_edgelist_impl.hpp @@ -97,7 +97,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, edge_t number_of_edges{}; vertex_t num_local_unique_edge_majors{}; vertex_t num_local_unique_edge_minors{}; - auto vertex_partition_segment_offsets = std::make_optional>(0); + std::vector vertex_partition_segment_offsets{}; { std::vector major_ptrs(col_comm_size); std::vector minor_ptrs(major_ptrs.size()); @@ -111,7 +111,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, partition, number_of_vertices, number_of_edges, - *vertex_partition_segment_offsets, + vertex_partition_segment_offsets, num_local_unique_edge_majors, num_local_unique_edge_minors) = cugraph::renumber_edgelist( diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 48f94712808..8a998f7854f 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -202,7 +202,7 @@ graph_t> const& segment_offsets, + std::vector const& segment_offsets, vertex_t num_local_unique_edge_rows, vertex_t num_local_unique_edge_cols, bool do_expensive_check) @@ -227,16 +227,12 @@ graph_t(col_comm_size), "Invalid input argument: errneous edgelists.size()."); CUGRAPH_EXPECTS( - !segment_offsets.has_value() || - ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 1)) || - ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 2)), + (segment_offsets.size() == (detail::num_sparse_segments_per_vertex_partition + 1)) || + (segment_offsets.size() == (detail::num_sparse_segments_per_vertex_partition + 2)), "Invalid input argument: segment_offsets.size() returns an invalid value."); auto is_weighted = edgelists[0].p_edge_weights.has_value(); - auto use_dcs = - segment_offsets - ? ((*segment_offsets).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) - : false; + auto use_dcs = segment_offsets.size() > (detail::num_sparse_segments_per_vertex_partition + 1); CUGRAPH_EXPECTS( std::any_of(edgelists.begin(), @@ -286,12 +282,12 @@ graph_t d_segment_offsets((*segment_offsets).size(), default_stream_view); + rmm::device_uvector d_segment_offsets(segment_offsets.size(), default_stream_view); raft::update_device(d_segment_offsets.data(), - (*segment_offsets).data(), - (*segment_offsets).size(), + segment_offsets.data(), + segment_offsets.size(), default_stream_view.value()); rmm::device_uvector d_aggregate_segment_offsets( col_comm_size * d_segment_offsets.size(), default_stream_view); @@ -332,7 +328,7 @@ graph_t{major_first + (*adj_matrix_partition_segment_offsets_) - [(*segment_offsets).size() * i + + [segment_offsets.size() * i + detail::num_sparse_segments_per_vertex_partition]} : std::nullopt; auto [offsets, indices, weights, dcs_nzd_vertices] = diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index f95f9e38ea8..71bdac0448c 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -163,12 +163,9 @@ std::unique_ptr> crea static_cast(graph_container.num_global_vertices), static_cast(graph_container.num_global_edges), graph_container.graph_props, - graph_container.segment_offsets != nullptr - ? 
std::make_optional>( - static_cast(graph_container.segment_offsets), - static_cast(graph_container.segment_offsets) + - graph_container.num_segments + 1) - : std::nullopt, + std::vector(static_cast(graph_container.segment_offsets), + static_cast(graph_container.segment_offsets) + + graph_container.num_segments + 1), graph_container.num_local_unique_edge_rows, graph_container.num_local_unique_edge_cols, graph_container.do_expensive_check); From 529544cf46f957b87558ecd6a924b559dacbea78 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 9 Sep 2021 15:22:20 -0400 Subject: [PATCH 31/57] update local unique edge rows/cols if using (key, value) pairs instead of contiguous arrays for row/col properties --- cpp/include/cugraph/graph.hpp | 4 +- cpp/include/cugraph/graph_view.hpp | 5 + cpp/src/structure/graph_impl.cuh | 146 ++++++++++++++++++++++++++++- 3 files changed, 148 insertions(+), 7 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 09f29e714cb..5bd5d5a5be3 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -136,8 +136,8 @@ class graph_t> local_sorted_unique_edge_rows{std::nullopt}; - std::optional> local_sorted_unique_edge_cols{std::nullopt}; + std::optional> local_sorted_unique_edge_rows_{std::nullopt}; + std::optional> local_sorted_unique_edge_cols_{std::nullopt}; }; // single-GPU version diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index f0040b9acb3..85135575f5c 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -223,6 +223,11 @@ namespace detail { using namespace cugraph::visitors; +// FIXME: threshold values require tuning +// use (key, value) pairs to store row/column properties if (unique edge rows/cols) over (V / +// row_comm_size|col_comm_size) is smaller than the threshold value +double constexpr row_col_properties_kv_pair_fill_ratio_threshold = 0.25; + // FIXME: threshold values require tuning // use the hypersparse format (currently, DCSR or DCSC) for the vertices with their degrees smaller // than col_comm_size * hypersparse_threshold_ratio, should be less than 1.0 diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 8a998f7854f..09a32b8f9a2 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -41,7 +41,7 @@ namespace cugraph { namespace { -// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_view_t") for an +// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an // extended __device__ lambda must allow its address to be taken) template struct out_of_range_t { @@ -59,6 +59,20 @@ struct out_of_range_t { } }; +// can't use lambda due to nvcc limitations (The enclosing parent function ("graph_t") for an +// extended __device__ lambda must allow its address to be taken) +template +struct has_nzd_t { + edge_t const* offsets{nullptr}; + vertex_t major_first{}; + + __device__ bool operator()(vertex_t major) const + { + auto major_offset = major - major_first; + return offsets[major_offset + 1] - offsets[major_offset] > 0; + } +}; + // compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid template std::tuple, @@ -251,12 +265,12 @@ graph_tget_number_of_edges(), "Invalid input argument: the sum of local edge counts does not match with number_of_edges."); @@ -278,6 +292,39 @@ graph_t majors(number_of_local_edges, handle.get_stream()); + rmm::device_uvector 
minors(number_of_local_edges, handle.get_stream()); + size_t cur_size{0}; + for (size_t i = 0; i < edgelists.size(); ++i) { + auto p_majors = store_transposed ? edgelists[i].p_dst_vertices : edgelists[i].p_src_vertices; + auto p_minors = store_transposed ? edgelists[i].p_src_vertices : edgelists[i].p_dst_vertices; + thrust::copy(handle.get_thrust_policy(), + p_majors, + p_majors + edgelists[i].number_of_edges, + majors.begin() + cur_size); + thrust::copy(handle.get_thrust_policy(), + p_minors, + p_minors + edgelists[i].number_of_edges, + minors.begin() + cur_size); + } + thrust::sort(handle.get_thrust_policy(), majors.begin(), majors.end()); + thrust::sort(handle.get_thrust_policy(), minors.begin(), minors.end()); + auto num_local_unique_edge_majors = static_cast(thrust::distance( + majors.begin(), thrust::unique(handle.get_thrust_policy(), majors.begin(), majors.end()))); + auto num_local_unique_edge_minors = static_cast(thrust::distance( + minors.begin(), thrust::unique(handle.get_thrust_policy(), minors.begin(), minors.end()))); + if constexpr (store_transposed) { + CUGRAPH_EXPECTS(num_local_unique_edge_majors == num_local_unique_edge_cols, + "Invalid input argument: num_unique_edge_cols is erroneous."); + CUGRAPH_EXPECTS(num_local_unique_edge_minors == num_local_unique_edge_rows, + "Invalid input argument: num_unique_edge_rows is erroneous."); + } else { + CUGRAPH_EXPECTS(num_local_unique_edge_majors == num_local_unique_edge_rows, + "Invalid input argument: num_unique_edge_rows is erroneous."); + CUGRAPH_EXPECTS(num_local_unique_edge_minors == num_local_unique_edge_cols, + "Invalid input argument: num_unique_edge_cols is erroneous."); + } } // aggregate segment_offsets @@ -350,6 +397,95 @@ graph_t(num_local_unique_edge_majors) / static_cast(aggregate_major_size), + handle.get_stream()); + auto max_minor_properties_fill_ratio = host_scalar_allreduce( + comm, + static_cast(num_local_unique_edge_minors) / static_cast(minor_size), + handle.get_stream()); + + if (max_major_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) { + rmm::device_uvector local_sorted_unique_edge_majors(num_local_unique_edge_majors, + handle.get_stream()); + size_t cur_size{0}; + for (size_t i = 0; i < adj_matrix_partition_offsets_.size(); ++i) { + auto [major_first, major_last] = partition.get_matrix_partition_major_range(i); + cur_size += thrust::distance( + local_sorted_unique_edge_majors.data() + cur_size, + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(major_first), + thrust::make_counting_iterator(major_last), + local_sorted_unique_edge_majors.data() + cur_size, + has_nzd_t{adj_matrix_partition_offsets_[i].data(), major_first})); + } + assert(cur_size == num_local_unique_edge_majors); + if constexpr (store_transposed) { + local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_majors); + } else { + local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_majors); + } + } + + if (max_minor_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) { + rmm::device_uvector local_sorted_unique_edge_minors(0, handle.get_stream()); + for (size_t i = 0; i < adj_matrix_partition_indices_.size(); ++i) { + rmm::device_uvector tmp_minors(adj_matrix_partition_indices_[i].size(), + handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + adj_matrix_partition_indices_[i].begin(), + adj_matrix_partition_indices_[i].end(), + tmp_minors.begin()); + thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), 
tmp_minors.end()); + tmp_minors.resize( + thrust::distance( + tmp_minors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), + handle.get_stream()); + auto cur_size = local_sorted_unique_edge_minors.size(); + if (cur_size == 0) { + local_sorted_unique_edge_minors = std::move(tmp_minors); + } else { + local_sorted_unique_edge_minors.resize( + local_sorted_unique_edge_minors.size() + tmp_minors.size(), handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + tmp_minors.begin(), + tmp_minors.end(), + local_sorted_unique_edge_minors.begin() + cur_size); + } + } + thrust::sort(handle.get_thrust_policy(), + local_sorted_unique_edge_minors.begin(), + local_sorted_unique_edge_minors.end()); + local_sorted_unique_edge_minors.resize( + thrust::distance(local_sorted_unique_edge_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + local_sorted_unique_edge_minors.begin(), + local_sorted_unique_edge_minors.end())), + handle.get_stream()); + local_sorted_unique_edge_minors.shrink_to_fit(handle.get_stream()); + if constexpr (store_transposed) { + local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_minors); + } else { + local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_minors); + } + } + // optional expensive checks (part 2/2) if (do_expensive_check) { From d2549ccd94d95d94c5327ea1cff55625e9b52b0d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 9 Sep 2021 16:27:57 -0400 Subject: [PATCH 32/57] update graph_view to take optional local edge rows/cols --- cpp/include/cugraph/graph.hpp | 8 ++++++++ cpp/include/cugraph/graph_view.hpp | 4 ++++ cpp/src/structure/graph_view_impl.cuh | 10 +++++++++- 3 files changed, 21 insertions(+), 1 deletion(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 5bd5d5a5be3..8b40287f3ae 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -115,6 +115,14 @@ class graph_tget_number_of_edges(), this->get_graph_properties(), adj_matrix_partition_segment_offsets_, + local_sorted_unique_edge_rows_ ? std::make_optional((*local_sorted_unique_edge_rows_).data()) : std::nullopt, + local_sorted_unique_edge_rows_ + ? std::make_optional((*local_sorted_unique_edge_rows_).data() + (*local_sorted_unique_edge_rows_).size()) + : std::nullopt, + local_sorted_unique_edge_cols_ ? std::make_optional((*local_sorted_unique_edge_cols_).data()) : std::nullopt, + local_sorted_unique_edge_cols_ + ? 
std::make_optional((*local_sorted_unique_edge_cols_).data() + (*local_sorted_unique_edge_cols_).size()) + : std::nullopt, false); } diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 85135575f5c..ae7bffa5e18 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -332,6 +332,10 @@ class graph_view_t> const& adj_matrix_partition_segment_offsets, + std::optional local_sorted_unique_edge_row_first, + std::optional local_sorted_unique_edge_row_last, + std::optional local_sorted_unique_edge_col_first, + std::optional local_sorted_unique_edge_col_last, bool do_expensive_check = false); bool is_weighted() const { return adj_matrix_partition_weights_.has_value(); } diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 156d86e5e76..b34ac1f5253 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -168,6 +168,10 @@ graph_view_t> const& adj_matrix_partition_segment_offsets, + std::optional local_sorted_unique_edge_row_first, + std::optional local_sorted_unique_edge_row_last, + std::optional local_sorted_unique_edge_col_first, + std::optional local_sorted_unique_edge_col_last, bool do_expensive_check) : detail::graph_base_t( handle, number_of_vertices, number_of_edges, properties), @@ -183,7 +187,11 @@ graph_view_t Date: Fri, 10 Sep 2021 15:36:49 -0400 Subject: [PATCH 33/57] refactor input parameters/return values of renumber_edgelist & graph_(view_)t constructors to better group meta information --- cpp/include/cugraph/graph.hpp | 56 ++++++--- cpp/include/cugraph/graph_functions.hpp | 50 +++++--- cpp/include/cugraph/graph_view.hpp | 38 ++++-- .../cugraph/visitors/graph_factory.hpp | 26 ++-- cpp/src/structure/coarsen_graph_impl.cuh | 48 ++++---- .../create_graph_from_edgelist_impl.hpp | 66 +++++----- cpp/src/structure/graph_impl.cuh | 66 +++++----- cpp/src/structure/graph_view_impl.cuh | 116 +++++++++--------- cpp/src/structure/renumber_edgelist_impl.cuh | 25 ++-- cpp/src/structure/renumber_edgelist_mg.cu | 60 ++++----- cpp/src/structure/renumber_edgelist_sg.cu | 6 +- cpp/src/utilities/cython.cu | 68 +++++----- cpp/tests/community/mg_louvain_helper.cu | 7 +- cpp/tests/structure/graph_test.cpp | 5 +- cpp/tests/visitors/bfs_test.cpp | 3 +- 15 files changed, 332 insertions(+), 308 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index de33469e792..0ab801f2191 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -37,6 +37,32 @@ struct edgelist_t { edge_t number_of_edges{0}; }; +template +struct graph_meta_t; + +// multi-GPU version +template +struct graph_meta_t> { + vertex_t number_of_vertices{}; + edge_t number_of_edges{}; + graph_properties_t properties{}; + + partition_t partition{}; + + // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered + std::optional> segment_offsets{std::nullopt}; +}; + +// single-GPU version +template +struct graph_meta_t> { + vertex_t number_of_vertices{}; + graph_properties_t properties{}; + + // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered + std::optional> segment_offsets{std::nullopt}; +}; + // graph_t is an owning graph class (note that graph_view_t is a non-owning graph class) template > const& edgelists, - partition_t const& partition, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties, - std::optional> const& segment_offsets, + 
graph_meta_t meta, bool do_expensive_check = false); bool is_weighted() const { return adj_matrix_partition_weights_.has_value(); } @@ -108,11 +130,13 @@ class graph_tget_number_of_vertices(), - this->get_number_of_edges(), - this->get_graph_properties(), - adj_matrix_partition_segment_offsets_, + graph_view_meta_t{ + this->get_number_of_vertices(), + this->get_number_of_edges(), + this->get_graph_properties(), + partition_, + adj_matrix_partition_segment_offsets_, + }, false); } @@ -155,9 +179,7 @@ class graph_t const& edgelist, - vertex_t number_of_vertices, - graph_properties_t properties, - std::optional> const& segment_offsets, + graph_meta_t meta, bool do_expensive_check = false); bool is_weighted() const { return weights_.has_value(); } @@ -169,10 +191,10 @@ class graph_t{(*weights_).data()} : std::nullopt, - this->get_number_of_vertices(), - this->get_number_of_edges(), - this->get_graph_properties(), - segment_offsets_, + graph_view_meta_t{this->get_number_of_vertices(), + this->get_number_of_edges(), + this->get_graph_properties(), + segment_offsets_}, false); } diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 61bf30e86a2..e11a1f8a2dc 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -28,6 +28,23 @@ namespace cugraph { +template +struct renumber_meta_t { +}; + +template +struct renumber_meta_t> { + vertex_t number_of_vertices{}; + edge_t number_of_edges{}; + partition_t partition{}; + std::vector segment_offsets{}; +}; + +template +struct renumber_meta_t> { + std::vector segment_offsets{}; +}; + /** * @brief renumber edgelist (multi-GPU) * @@ -68,19 +85,18 @@ namespace cugraph { * compute_gpu_id_from_vertex_t function to edge minor vertex IDs. This optinoal information is used * for further memory footprint optimization if provided. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). - * @return std::tuple, partition_t, vertex_t, edge_t, - * std::vector> Tuple of labels (vertex IDs before renumbering) for the entire set of - * vertices (assigned to this process in multi-GPU), partition_t object storing graph partitioning - * information, total number of vertices, total number of edges, and vertex partition segment - * offsets (a vertex partition is partitioned to multiple segments based on vertex degrees). + * @return std::tuple, renumber_meta_t> + * Tuple of labels (vertex IDs before renumbering) for the entire set of vertices (assigned to this + * process in multi-GPU) and meta-data collected while renumbering. The meta-data includes total + * number of vertices, total number of edges, partition_t object storing graph partitioning + * information, and vertex partition segment offsets (a vertex partition is partitioned to multiple + * segments based on vertex degrees). This meta-data is expected to be used in graph construction & + * graph primitives. */ template -std::enable_if_t, - partition_t, - vertex_t, - edge_t, - std::vector>> +std::enable_if_t< + multi_gpu, + std::tuple, renumber_meta_t>> renumber_edgelist( raft::handle_t const& handle, std::optional> local_vertex_span, @@ -112,12 +128,16 @@ renumber_edgelist( * Vertex IDs are updated in-place ([INOUT] parameter). * @param num_edgelist_edges Number of edges in the edgelist. * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
- * @return std::tuple, std::vector> Tuple of abels (vertex - * IDs before renumbering) for the entire set of vertices and vertex partition segment offsets (a - * vertex partition is partitioned to multiple segments based on vertex degrees). + * @return std::tuple, renumber_meta_t> + * Tuple of labels (vertex IDs before renumbering) for the entire set of vertices and meta-data + * collected while renumbering. The meta-data includes vertex partition segment offsets (a vertex + * partition is partitioned to multiple segments based on vertex degrees). This meta-data is + * expected to be used in graph construction & graph primitives. */ template -std::enable_if_t, std::vector>> +std::enable_if_t< + !multi_gpu, + std::tuple, renumber_meta_t>> renumber_edgelist(raft::handle_t const& handle, std::optional> vertex_span, vertex_t* edgelist_major_vertices /* [INOUT] */, diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 81aa00fd2ea..cb18571886c 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -286,6 +286,33 @@ class graph_base_t : public graph_envelope_t::base_graph_t /*<- visitor logic*/ } // namespace detail +template +struct graph_view_meta_t; + +// multi-GPU version +template +struct graph_view_meta_t> { + vertex_t number_of_vertices; + edge_t number_of_edges; + graph_properties_t properties; + + partition_t partition{}; + + // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered + std::optional> adj_matrix_partition_segment_offsets{}; +}; + +// single-GPU version +template +struct graph_view_meta_t> { + vertex_t number_of_vertices; + edge_t number_of_edges; + graph_properties_t properties; + + // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered + std::optional> segment_offsets{std::nullopt}; +}; + // graph_view_t is a non-owning graph class (note that graph_t is an owning graph class) template > const& adj_matrix_partition_weights, std::optional> const& adj_matrix_partition_dcs_nzd_vertices, std::optional> const& adj_matrix_partition_dcs_nzd_vertex_counts, - partition_t const& partition, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties, - std::optional> const& adj_matrix_partition_segment_offsets, + graph_view_meta_t meta, bool do_expensive_check = false); bool is_weighted() const { return adj_matrix_partition_weights_.has_value(); } @@ -621,10 +644,7 @@ class graph_view_t weights, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties, - std::optional> const& segment_offsets, + graph_view_meta_t meta, bool do_expensive_check = false); bool is_weighted() const { return weights_.has_value(); } diff --git a/cpp/include/cugraph/visitors/graph_factory.hpp b/cpp/include/cugraph/visitors/graph_factory.hpp index 5e25624f814..9c8198bf3b0 100644 --- a/cpp/include/cugraph/visitors/graph_factory.hpp +++ b/cpp/include/cugraph/visitors/graph_factory.hpp @@ -159,15 +159,11 @@ struct graph_factory_t< std::optional> opt_seg_off{}; // FIXME: may needd to pass/extract segment_offsets vector + graph_meta_t meta{ + num_global_vertices, num_global_edges, graph_props, partition, opt_seg_off}; + return std::make_unique>( - handle, - edgelist, - partition, - num_global_vertices, - num_global_edges, - graph_props, - opt_seg_off, - do_expensive_check); + handle, edgelist, meta, do_expensive_check); } }; @@ -184,24 +180,18 @@ struct graph_factory_t< /// std::cout << "Single-GPU factory.\n"; std::vector 
const& v_args{ep.get_args()}; - assert(v_args.size() == 6); + assert(v_args.size() == 4); raft::handle_t const& handle = *static_cast(v_args[0]); auto const& elist = *static_cast const*>(v_args[1]); - auto nv = *static_cast(v_args[2]); - - auto props = *static_cast(v_args[3]); - - bool sorted = *static_cast(v_args[4]); // FIXME: no need to pass this! - - bool check = *static_cast(v_args[5]); + auto meta = *static_cast const*>(v_args[2]); - std::optional> opt_seg_off{}; // should not be needed for (!multi_gpu) + bool check = *static_cast(v_args[3]); return std::make_unique>( - handle, elist, nv, props, opt_seg_off, check); + handle, elist, meta, check); } }; diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 345ce989e53..716c9f67993 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -420,14 +420,7 @@ coarsen_graph( // 4. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); - partition_t partition(std::vector(comm_size + 1, 0), - row_comm_size, - col_comm_size, - row_comm_rank, - col_comm_rank); - vertex_t number_of_vertices{}; - edge_t number_of_edges{}; - std::optional> segment_offsets{}; + renumber_meta_t meta{}; { std::vector major_ptrs(coarsened_edgelist_major_vertices.size()); std::vector minor_ptrs(major_ptrs.size()); @@ -437,16 +430,15 @@ coarsen_graph( minor_ptrs[i] = coarsened_edgelist_minor_vertices[i].data(); counts[i] = static_cast(coarsened_edgelist_major_vertices[i].size()); } - std::tie(renumber_map_labels, partition, number_of_vertices, number_of_edges, segment_offsets) = - renumber_edgelist( - handle, - std::optional>{ - std::make_tuple(unique_labels.data(), static_cast(unique_labels.size()))}, - major_ptrs, - minor_ptrs, - counts, - std::nullopt, - do_expensive_check); + std::tie(renumber_map_labels, meta) = renumber_edgelist( + handle, + std::optional>{ + std::make_tuple(unique_labels.data(), static_cast(unique_labels.size()))}, + major_ptrs, + minor_ptrs, + counts, + std::nullopt, + do_expensive_check); } // 5. 
build a graph @@ -469,11 +461,12 @@ coarsen_graph( std::make_unique>( handle, edgelists, - partition, - number_of_vertices, - number_of_edges, - graph_properties_t{graph_view.is_symmetric(), false}, - segment_offsets), + graph_meta_t{ + meta.number_of_vertices, + meta.number_of_edges, + graph_properties_t{graph_view.is_symmetric(), false}, + meta.partition, + meta.segment_offsets}), std::move(renumber_map_labels)); } @@ -519,7 +512,7 @@ coarsen_graph( thrust::unique(handle.get_thrust_policy(), unique_labels.begin(), unique_labels.end())), handle.get_stream()); - auto [renumber_map_labels, segment_offsets] = renumber_edgelist( + auto [renumber_map_labels, meta] = renumber_edgelist( handle, std::optional>{ std::make_tuple(unique_labels.data(), static_cast(unique_labels.size()))}, @@ -542,9 +535,10 @@ coarsen_graph( std::make_unique>( handle, edgelist, - static_cast(renumber_map_labels.size()), - graph_properties_t{graph_view.is_symmetric(), false}, - segment_offsets), + graph_meta_t{ + static_cast(renumber_map_labels.size()), + graph_properties_t{graph_view.is_symmetric(), false}, + meta.segment_offsets}), std::move(renumber_map_labels)); } diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.hpp b/cpp/src/structure/create_graph_from_edgelist_impl.hpp index 58991f2477c..d60d14c8ac6 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.hpp +++ b/cpp/src/structure/create_graph_from_edgelist_impl.hpp @@ -95,10 +95,7 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, // 2. renumber rmm::device_uvector renumber_map_labels(0, handle.get_stream()); - cugraph::partition_t partition{}; - vertex_t number_of_vertices{}; - edge_t number_of_edges{}; - auto vertex_partition_segment_offsets = std::make_optional>(0); + renumber_meta_t meta{}; { std::vector major_ptrs(col_comm_size); std::vector minor_ptrs(major_ptrs.size()); @@ -108,21 +105,16 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, minor_ptrs[i] = (store_transposed ? edgelist_rows.begin() : edgelist_cols.begin()) + edgelist_displacements[i]; } - std::tie(renumber_map_labels, - partition, - number_of_vertices, - number_of_edges, - *vertex_partition_segment_offsets) = - cugraph::renumber_edgelist( - handle, - local_vertex_span - ? std::optional>{std::make_tuple( - (*local_vertex_span).data(), static_cast((*local_vertex_span).size()))} - : std::nullopt, - major_ptrs, - minor_ptrs, - edgelist_edge_counts, - edgelist_intra_partition_segment_offsets); + std::tie(renumber_map_labels, meta) = cugraph::renumber_edgelist( + handle, + local_vertex_span + ? std::optional>{std::make_tuple( + (*local_vertex_span).data(), static_cast((*local_vertex_span).size()))} + : std::nullopt, + major_ptrs, + minor_ptrs, + edgelist_edge_counts, + edgelist_intra_partition_segment_offsets); } // 3. create a graph @@ -142,11 +134,11 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, cugraph::graph_t( handle, edgelists, - partition, - number_of_vertices, - number_of_edges, - graph_properties, - vertex_partition_segment_offsets), + cugraph::graph_meta_t{meta.number_of_vertices, + meta.number_of_edges, + graph_properties, + meta.partition, + meta.segment_offsets}), std::optional>{std::move(renumber_map_labels)}); } @@ -171,17 +163,16 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, renumber ? 
std::make_optional>(0, handle.get_stream()) : std::nullopt; std::optional> segment_offsets{std::nullopt}; + renumber_meta_t meta{}; if (renumber) { - segment_offsets = std::vector{}; - std::tie(*renumber_map_labels, *segment_offsets) = - cugraph::renumber_edgelist( - handle, - vertex_span ? std::optional>{std::make_tuple( - (*vertex_span).data(), static_cast((*vertex_span).size()))} - : std::nullopt, - store_transposed ? edgelist_cols.data() : edgelist_rows.data(), - store_transposed ? edgelist_rows.data() : edgelist_cols.data(), - static_cast(edgelist_rows.size())); + std::tie(*renumber_map_labels, meta) = cugraph::renumber_edgelist( + handle, + vertex_span ? std::optional>{std::make_tuple( + (*vertex_span).data(), static_cast((*vertex_span).size()))} + : std::nullopt, + store_transposed ? edgelist_cols.data() : edgelist_rows.data(), + store_transposed ? edgelist_rows.data() : edgelist_cols.data(), + static_cast(edgelist_rows.size())); } vertex_t num_vertices{}; @@ -205,9 +196,10 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, edgelist_weights ? std::optional{(*edgelist_weights).data()} : std::nullopt, static_cast(edgelist_rows.size())}, - num_vertices, - graph_properties, - std::optional>{segment_offsets}), + cugraph::graph_meta_t{ + num_vertices, + graph_properties, + renumber ? std::optional>{meta.segment_offsets} : std::nullopt}), std::move(renumber_map_labels)); } diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index b226427d613..e54d69204ac 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -198,15 +198,11 @@ template >:: graph_t(raft::handle_t const& handle, std::vector> const& edgelists, - partition_t const& partition, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties, - std::optional> const& segment_offsets, + graph_meta_t meta, bool do_expensive_check) : detail::graph_base_t( - handle, number_of_vertices, number_of_edges, properties), - partition_(partition) + handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), + partition_(meta.partition) { // cheap error checks @@ -225,15 +221,16 @@ graph_t(col_comm_size), "Invalid input argument: errneous edgelists.size()."); CUGRAPH_EXPECTS( - !segment_offsets.has_value() || - ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 1)) || - ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 2)), - "Invalid input argument: segment_offsets.size() returns an invalid value."); + !(meta.segment_offsets).has_value() || + ((*(meta.segment_offsets)).size() == + (detail::num_sparse_segments_per_vertex_partition + 1)) || + ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 2)), + "Invalid input argument: (*(meta.segment_offsets)).size() returns an invalid value."); auto is_weighted = edgelists[0].p_edge_weights.has_value(); auto use_dcs = - segment_offsets - ? ((*segment_offsets).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) + meta.segment_offsets + ? 
((*(meta.segment_offsets)).size() > (detail::num_sparse_segments_per_vertex_partition + 1)) : false; CUGRAPH_EXPECTS( @@ -255,8 +252,8 @@ graph_tget_number_of_edges(), - "Invalid input argument: the sum of local edge counts does not match with number_of_edges."); + CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(), + "Invalid input argument: the sum of local edge counts does not match with " + "meta.number_of_edges."); CUGRAPH_EXPECTS( - partition.get_vertex_partition_last(comm_size - 1) == number_of_vertices, - "Invalid input argument: vertex partition should cover [0, number_of_vertices)."); + partition_.get_vertex_partition_last(comm_size - 1) == meta.number_of_vertices, + "Invalid input argument: vertex partition should cover [0, meta.number_of_vertices)."); } // aggregate segment_offsets - if (segment_offsets) { + if (meta.segment_offsets) { // FIXME: we need to add host_allgather - rmm::device_uvector d_segment_offsets((*segment_offsets).size(), default_stream_view); + rmm::device_uvector d_segment_offsets((*(meta.segment_offsets)).size(), + default_stream_view); raft::update_device(d_segment_offsets.data(), - (*segment_offsets).data(), - (*segment_offsets).size(), + (*(meta.segment_offsets)).data(), + (*(meta.segment_offsets)).size(), default_stream_view.value()); rmm::device_uvector d_aggregate_segment_offsets( col_comm_size * d_segment_offsets.size(), default_stream_view); @@ -325,12 +323,12 @@ graph_t{major_first + (*adj_matrix_partition_segment_offsets_) - [(*segment_offsets).size() * i + + [(*(meta.segment_offsets)).size() * i + detail::num_sparse_segments_per_vertex_partition]} : std::nullopt; auto [offsets, indices, weights, dcs_nzd_vertices] = @@ -371,15 +369,13 @@ template >:: graph_t(raft::handle_t const& handle, edgelist_t const& edgelist, - vertex_t number_of_vertices, - graph_properties_t properties, - std::optional> const& segment_offsets, + graph_meta_t meta, bool do_expensive_check) : detail::graph_base_t( - handle, number_of_vertices, edgelist.number_of_edges, properties), + handle, meta.number_of_vertices, edgelist.number_of_edges, meta.properties), offsets_(rmm::device_uvector(0, handle.get_stream_view())), indices_(rmm::device_uvector(0, handle.get_stream_view())), - segment_offsets_(segment_offsets) + segment_offsets_(meta.segment_offsets) { // cheap error checks @@ -397,9 +393,9 @@ graph_t 0."); CUGRAPH_EXPECTS( - !segment_offsets.has_value() || - ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 1)), - "Invalid input argument: segment_offsets.size() returns an invalid value."); + !segment_offsets_.has_value() || + ((*segment_offsets_).size() == (detail::num_sparse_segments_per_vertex_partition + 1)), + "Invalid input argument: (*(meta.segment_offsets)).size() returns an invalid value."); // optional expensive checks (part 1/2) diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 156d86e5e76..368573d4a91 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -163,14 +163,10 @@ graph_view_t> const& adj_matrix_partition_weights, std::optional> const& adj_matrix_partition_dcs_nzd_vertices, std::optional> const& adj_matrix_partition_dcs_nzd_vertex_counts, - partition_t const& partition, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties, - std::optional> const& adj_matrix_partition_segment_offsets, + graph_view_meta_t meta, bool do_expensive_check) : detail::graph_base_t( - handle, 
number_of_vertices, number_of_edges, properties), + handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), adj_matrix_partition_offsets_(adj_matrix_partition_offsets), adj_matrix_partition_indices_(adj_matrix_partition_indices), adj_matrix_partition_weights_(adj_matrix_partition_weights), @@ -179,11 +175,11 @@ graph_view_t{}), - "Invalid Invalid input argument: adj_matrix_partition_segment_offsets are " - "provided, but degrees are not in descending order."); + "Invalid Invalid input argument: meta.adj_matrix_partition_segment_offsets " + "are provided, but degrees are not in descending order."); auto num_segments_per_vertex_partition = detail::num_sparse_segments_per_vertex_partition + (use_dcs ? 1 : 0); for (int i = 0; i < col_comm_size; ++i) { - CUGRAPH_EXPECTS(std::is_sorted((*adj_matrix_partition_segment_offsets).begin() + + CUGRAPH_EXPECTS(std::is_sorted((*(meta.adj_matrix_partition_segment_offsets)).begin() + (num_segments_per_vertex_partition + 1) * i, - (*adj_matrix_partition_segment_offsets).begin() + + (*(meta.adj_matrix_partition_segment_offsets)).begin() + (num_segments_per_vertex_partition + 1) * (i + 1)), - "Internal Error: erroneous adj_matrix_partition_segment_offsets."); + "Internal Error: erroneous meta.adj_matrix_partition_segment_offsets."); CUGRAPH_EXPECTS( - (*adj_matrix_partition_segment_offsets)[(num_segments_per_vertex_partition + 1) * i] == 0, - "Internal Error: erroneous adj_matrix_partition_segment_offsets."); + (*(meta.adj_matrix_partition_segment_offsets))[(num_segments_per_vertex_partition + 1) * + i] == 0, + "Internal Error: erroneous meta.adj_matrix_partition_segment_offsets."); auto vertex_partition_idx = row_comm_size * i + row_comm_rank; CUGRAPH_EXPECTS( - (*adj_matrix_partition_segment_offsets)[(num_segments_per_vertex_partition + 1) * i + - num_segments_per_vertex_partition] == - partition.get_vertex_partition_size(vertex_partition_idx), - "Internal Error: erroneous adj_matrix_partition_segment_offsets."); + (*(meta + .adj_matrix_partition_segment_offsets))[(num_segments_per_vertex_partition + 1) * i + + num_segments_per_vertex_partition] == + partition_.get_vertex_partition_size(vertex_partition_idx), + "Internal Error: erroneous meta.adj_matrix_partition_segment_offsets."); } } - CUGRAPH_EXPECTS(partition.get_vertex_partition_last(comm_size - 1) == number_of_vertices, - "Internal Error: vertex partition should cover [0, number_of_vertices)."); + CUGRAPH_EXPECTS( + partition_.get_vertex_partition_last(comm_size - 1) == this->get_number_of_vertices(), + "Internal Error: vertex partition should cover [0, number_of_vertices)."); // FIXME: check for symmetricity may better be implemetned with transpose(). 
if (this->is_symmetric()) {} @@ -327,34 +326,31 @@ template -graph_view_t>::graph_view_t(raft::handle_t const& handle, - edge_t const* offsets, - vertex_t const* indices, - std::optional weights, - vertex_t number_of_vertices, - edge_t number_of_edges, - graph_properties_t properties, - std::optional> const& - segment_offsets, - bool do_expensive_check) +graph_view_t< + vertex_t, + edge_t, + weight_t, + store_transposed, + multi_gpu, + std::enable_if_t>::graph_view_t(raft::handle_t const& handle, + edge_t const* offsets, + vertex_t const* indices, + std::optional weights, + graph_view_meta_t meta, + bool do_expensive_check) : detail::graph_base_t( - handle, number_of_vertices, number_of_edges, properties), + handle, meta.number_of_vertices, meta.number_of_edges, meta.properties), offsets_(offsets), indices_(indices), weights_(weights), - segment_offsets_(segment_offsets) + segment_offsets_(meta.segment_offsets) { // cheap error checks CUGRAPH_EXPECTS( - !segment_offsets.has_value() || - ((*segment_offsets).size() == (detail::num_sparse_segments_per_vertex_partition + 1)), - "Internal Error: segment_offsets.size() returns an invalid value."); + !(meta.segment_offsets).has_value() || + ((*(meta.segment_offsets)).size() == (detail::num_sparse_segments_per_vertex_partition + 1)), + "Internal Error: (*(meta.segment_offsets)).size() returns an invalid value."); // optional expensive checks @@ -374,20 +370,22 @@ graph_view_t{0, this->get_number_of_vertices()}) == 0, "Internal Error: adj_matrix_partition_indices[] have out-of-range vertex IDs."); - if (segment_offsets) { - auto degrees = detail::compute_major_degrees(handle, offsets, number_of_vertices); + if (meta.segment_offsets) { + auto degrees = detail::compute_major_degrees(handle, offsets, this->get_number_of_vertices()); CUGRAPH_EXPECTS(thrust::is_sorted(rmm::exec_policy(default_stream_view), degrees.begin(), degrees.end(), thrust::greater{}), - "Invalid Invalid input argument: segment_offsets are provided, but degrees " + "Invalid Invalid input argument: meta.segment_offsets is valid, but degrees " "are not in descending order."); - CUGRAPH_EXPECTS(std::is_sorted((*segment_offsets).begin(), (*segment_offsets).end()), - "Internal Error: erroneous segment_offsets."); - CUGRAPH_EXPECTS((*segment_offsets)[0] == 0, "Invalid input argument segment_offsets."); - CUGRAPH_EXPECTS((*segment_offsets).back() == this->get_number_of_vertices(), - "Invalid input argument: segment_offsets."); + CUGRAPH_EXPECTS( + std::is_sorted((*(meta.segment_offsets)).begin(), (*(meta.segment_offsets)).end()), + "Internal Error: erroneous meta.segment_offsets."); + CUGRAPH_EXPECTS((*(meta.segment_offsets))[0] == 0, + "Invalid input argument meta.segment_offsets."); + CUGRAPH_EXPECTS((*(meta.segment_offsets)).back() == this->get_number_of_vertices(), + "Invalid input argument: meta.segment_offsets."); } // FIXME: check for symmetricity may better be implemetned with transpose(). 
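// The segment-offsets checks in the two constructors above reduce to a few host-side
// invariants. A standalone sketch (the sparse-segment count of 3 is an assumption
// standing in for detail::num_sparse_segments_per_vertex_partition):
//
//   #include <algorithm>
//   #include <cstddef>
//   #include <cstdint>
//   #include <vector>
//
//   bool valid_segment_offsets(std::vector<int32_t> const& offsets, int32_t num_vertices)
//   {
//     constexpr std::size_t num_sparse_segments = 3;  // assumed value
//     bool size_ok = (offsets.size() == num_sparse_segments + 1) ||
//                    (offsets.size() == num_sparse_segments + 2);  // latter: + hypersparse
//     return size_ok && std::is_sorted(offsets.begin(), offsets.end()) &&
//            (offsets.front() == 0) && (offsets.back() == num_vertices);
//   }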
diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 6efbf13e41b..8104db8eebc 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -605,12 +605,9 @@ void expensive_check_edgelist( } // namespace detail template -std::enable_if_t, - partition_t, - vertex_t, - edge_t, - std::vector>> +std::enable_if_t< + multi_gpu, + std::tuple, renumber_meta_t>> renumber_edgelist( raft::handle_t const& handle, std::optional> local_vertex_span, @@ -829,15 +826,16 @@ renumber_edgelist( comm.barrier(); // currently, this is ncclAllReduce #endif - return std::make_tuple(std::move(renumber_map_labels), - partition, - number_of_vertices, - number_of_edges, - vertex_partition_segment_offsets); + return std::make_tuple( + std::move(renumber_map_labels), + renumber_meta_t{ + number_of_vertices, number_of_edges, partition, vertex_partition_segment_offsets}); } template -std::enable_if_t, std::vector>> +std::enable_if_t< + !multi_gpu, + std::tuple, renumber_meta_t>> renumber_edgelist(raft::handle_t const& handle, std::optional> vertex_span, vertex_t* edgelist_major_vertices /* [INOUT] */, @@ -886,7 +884,8 @@ renumber_edgelist(raft::handle_t const& handle, renumber_map.find( edgelist_minor_vertices, edgelist_minor_vertices + num_edgelist_edges, edgelist_minor_vertices); - return std::make_tuple(std::move(renumber_map_labels), segment_offsets); + return std::make_tuple(std::move(renumber_map_labels), + renumber_meta_t{segment_offsets}); } } // namespace cugraph diff --git a/cpp/src/structure/renumber_edgelist_mg.cu b/cpp/src/structure/renumber_edgelist_mg.cu index 03ba230e598..4e9f37e10bb 100644 --- a/cpp/src/structure/renumber_edgelist_mg.cu +++ b/cpp/src/structure/renumber_edgelist_mg.cu @@ -19,40 +19,34 @@ namespace cugraph { // MG instantiation -template std:: - tuple, partition_t, int32_t, int32_t, std::vector> - renumber_edgelist( - raft::handle_t const& handle, - std::optional> optional_local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& edgelist_edge_counts, - std::optional>> const& - edgelist_intra_partition_segment_offsets, - bool do_expensive_check); +template std::tuple, renumber_meta_t> +renumber_edgelist( + raft::handle_t const& handle, + std::optional> optional_local_vertex_span, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + std::optional>> const& edgelist_intra_partition_segment_offsets, + bool do_expensive_check); -template std:: - tuple, partition_t, int32_t, int64_t, std::vector> - renumber_edgelist( - raft::handle_t const& handle, - std::optional> optional_local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& edgelist_edge_counts, - std::optional>> const& - edgelist_intra_partition_segment_offsets, - bool do_expensive_check); +template std::tuple, renumber_meta_t> +renumber_edgelist( + raft::handle_t const& handle, + std::optional> optional_local_vertex_span, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + std::optional>> const& edgelist_intra_partition_segment_offsets, + bool do_expensive_check); -template std:: - tuple, partition_t, int64_t, 
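// The renumber_edgelist_*.cu files below only explicitly instantiate the templates
// defined in renumber_edgelist_impl.cuh for each supported (vertex_t, edge_t,
// multi_gpu) combination, so template bodies stay out of user translation units.
// The mechanism in miniature (illustrative):
//
//   // lib.hpp: declaration only
//   template <typename T> T twice(T v);
//
//   // lib.cu: definition plus explicit instantiations
//   template <typename T> T twice(T v) { return v + v; }
//   template int twice<int>(int);
//   template long twice<long>(long);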
int64_t, std::vector> - renumber_edgelist( - raft::handle_t const& handle, - std::optional> optional_local_vertex_span, - std::vector const& edgelist_major_vertices /* [INOUT] */, - std::vector const& edgelist_minor_vertices /* [INOUT] */, - std::vector const& edgelist_edge_counts, - std::optional>> const& - edgelist_intra_partition_segment_offsets, - bool do_expensive_check); +template std::tuple, renumber_meta_t> +renumber_edgelist( + raft::handle_t const& handle, + std::optional> optional_local_vertex_span, + std::vector const& edgelist_major_vertices /* [INOUT] */, + std::vector const& edgelist_minor_vertices /* [INOUT] */, + std::vector const& edgelist_edge_counts, + std::optional>> const& edgelist_intra_partition_segment_offsets, + bool do_expensive_check); } // namespace cugraph diff --git a/cpp/src/structure/renumber_edgelist_sg.cu b/cpp/src/structure/renumber_edgelist_sg.cu index e8409cdfe9f..3bb25d74b2e 100644 --- a/cpp/src/structure/renumber_edgelist_sg.cu +++ b/cpp/src/structure/renumber_edgelist_sg.cu @@ -19,7 +19,7 @@ namespace cugraph { // SG instantiation -template std::tuple, std::vector> +template std::tuple, renumber_meta_t> renumber_edgelist( raft::handle_t const& handle, std::optional> optional_vertex_span, @@ -28,7 +28,7 @@ renumber_edgelist( int32_t num_edgelist_edges, bool do_expensive_check); -template std::tuple, std::vector> +template std::tuple, renumber_meta_t> renumber_edgelist( raft::handle_t const& handle, std::optional> optional_vertex_span, @@ -37,7 +37,7 @@ renumber_edgelist( int64_t num_edgelist_edges, bool do_expensive_check); -template std::tuple, std::vector> +template std::tuple, renumber_meta_t> renumber_edgelist( raft::handle_t const& handle, std::optional> optional_vertex_span, diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 25d42ec1f22..bee78fb9952 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -159,16 +159,17 @@ std::unique_ptr> crea return std::make_unique>( handle, edgelists, - partition, - static_cast(graph_container.num_global_vertices), - static_cast(graph_container.num_global_edges), - graph_container.graph_props, - graph_container.segment_offsets != nullptr - ? std::make_optional>( - static_cast(graph_container.segment_offsets), - static_cast(graph_container.segment_offsets) + - graph_container.num_segments + 1) - : std::nullopt, + graph_meta_t{ + static_cast(graph_container.num_global_vertices), + static_cast(graph_container.num_global_edges), + graph_container.graph_props, + partition, + graph_container.segment_offsets != nullptr + ? std::make_optional>( + static_cast(graph_container.segment_offsets), + static_cast(graph_container.segment_offsets) + + graph_container.num_segments + 1) + : std::nullopt}, graph_container.do_expensive_check); } @@ -191,14 +192,15 @@ std::unique_ptr> crea return std::make_unique>( handle, edgelist, - static_cast(graph_container.num_global_vertices), - graph_container.graph_props, - graph_container.segment_offsets != nullptr - ? std::make_optional>( - static_cast(graph_container.segment_offsets), - static_cast(graph_container.segment_offsets) + - graph_container.num_segments + 1) - : std::nullopt, + graph_meta_t{ + static_cast(graph_container.num_global_vertices), + graph_container.graph_props, + graph_container.segment_offsets != nullptr + ? 
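// The Cython graph_container hands segment offsets over as a raw pointer plus a
// count; the call sites above wrap that pair into std::optional<std::vector<...>>.
// The same conversion in isolation (names illustrative):
//
//   #include <cstddef>
//   #include <cstdint>
//   #include <optional>
//   #include <vector>
//
//   std::optional<std::vector<std::int32_t>> to_optional_vector(std::int32_t const* p,
//                                                               std::size_t n)
//   {
//     return (p != nullptr) ? std::make_optional<std::vector<std::int32_t>>(p, p + n)
//                           : std::nullopt;
//   }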
std::make_optional>( + static_cast(graph_container.segment_offsets), + static_cast(graph_container.segment_offsets) + + graph_container.num_segments + 1) + : std::nullopt}, graph_container.do_expensive_check); } @@ -1212,20 +1214,16 @@ std::unique_ptr> call_renumber( minor_ptrs[i] = shuffled_edgelist_minor_vertices + displacements[i]; } - std::tie(p_ret->get_dv(), - p_ret->get_partition(), - p_ret->get_num_vertices(), - p_ret->get_num_edges(), - p_ret->get_segment_offsets()) = - cugraph::renumber_edgelist(handle, - std::nullopt, - major_ptrs, - minor_ptrs, - edge_counts, - std::nullopt, - do_expensive_check); + cugraph::renumber_meta_t meta{}; + std::tie(p_ret->get_dv(), meta) = cugraph::renumber_edgelist( + handle, std::nullopt, major_ptrs, minor_ptrs, edge_counts, std::nullopt, do_expensive_check); + p_ret->get_num_vertices() = meta.number_of_vertices; + p_ret->get_num_edges() = meta.number_of_edges; + p_ret->get_partition() = meta.partition; + p_ret->get_segment_offsets() = meta.segment_offsets; } else { - std::tie(p_ret->get_dv(), p_ret->get_segment_offsets()) = + cugraph::renumber_meta_t meta{}; + std::tie(p_ret->get_dv(), meta) = cugraph::renumber_edgelist(handle, std::nullopt, shuffled_edgelist_major_vertices, @@ -1233,10 +1231,10 @@ std::unique_ptr> call_renumber( edge_counts[0], do_expensive_check); - p_ret->get_partition() = cugraph::partition_t{}; // dummy - - p_ret->get_num_vertices() = static_cast(p_ret->get_dv().size()); - p_ret->get_num_edges() = edge_counts[0]; + p_ret->get_num_vertices() = static_cast(p_ret->get_dv().size()); + p_ret->get_num_edges() = edge_counts[0]; + p_ret->get_partition() = cugraph::partition_t{}; // dummy + p_ret->get_segment_offsets() = meta.segment_offsets; } return p_ret; // RVO-ed (copy ellision) diff --git a/cpp/tests/community/mg_louvain_helper.cu b/cpp/tests/community/mg_louvain_helper.cu index d52d8657e2a..0bb0d801229 100644 --- a/cpp/tests/community/mg_louvain_helper.cu +++ b/cpp/tests/community/mg_louvain_helper.cu @@ -254,9 +254,10 @@ coarsen_graph( return std::make_unique>( handle, edgelist, - new_number_of_vertices, - cugraph::graph_properties_t{graph_view.is_symmetric(), false}, - std::nullopt); + cugraph::graph_meta_t{ + new_number_of_vertices, + cugraph::graph_properties_t{graph_view.is_symmetric(), false}, + std::nullopt}); } // explicit instantiation diff --git a/cpp/tests/structure/graph_test.cpp b/cpp/tests/structure/graph_test.cpp index 614ef2528c5..7966a4d141c 100644 --- a/cpp/tests/structure/graph_test.cpp +++ b/cpp/tests/structure/graph_test.cpp @@ -132,9 +132,8 @@ class Tests_Graph : public ::testing::TestWithParam { auto graph = cugraph::graph_t( handle, edgelist, - number_of_vertices, - cugraph::graph_properties_t{is_symmetric, false}, - std::nullopt, + cugraph::graph_meta_t{ + number_of_vertices, cugraph::graph_properties_t{is_symmetric, false}, std::nullopt}, true); auto graph_view = graph.view(); diff --git a/cpp/tests/visitors/bfs_test.cpp b/cpp/tests/visitors/bfs_test.cpp index e6337cd459f..1b971ea84e0 100644 --- a/cpp/tests/visitors/bfs_test.cpp +++ b/cpp/tests/visitors/bfs_test.cpp @@ -137,7 +137,8 @@ class Tests_BFS : public ::testing::TestWithParam { bool sorted{false}; bool check{false}; - erased_pack_t ep_graph{&handle, &edgelist, &num_vertices, &graph_props, &sorted, &check}; + cugraph::graph_meta_t meta{num_vertices, graph_props, std::nullopt}; + erased_pack_t ep_graph{&handle, &edgelist, &meta, &check}; DTypes vertex_tid = reverse_dmap_t::type_id; DTypes edge_tid = reverse_dmap_t::type_id; From 
93e5839e82f9f9eeda31034e5db0f71f2eee5ea1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 10 Sep 2021 15:48:38 -0400 Subject: [PATCH 34/57] cosmetics --- cpp/include/cugraph/graph_functions.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index e11a1f8a2dc..b071f9eada4 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -29,8 +29,7 @@ namespace cugraph { template -struct renumber_meta_t { -}; +struct renumber_meta_t; template struct renumber_meta_t> { From d6a95c5811f5b517a52a060f2dec4deb7028c673 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 10 Sep 2021 17:55:42 -0400 Subject: [PATCH 35/57] additional fixes after merge --- cpp/include/cugraph/graph.hpp | 17 +++++++++++++++ cpp/include/cugraph/graph_functions.hpp | 3 +++ cpp/include/cugraph/graph_view.hpp | 5 +++++ cpp/src/structure/graph_impl.cuh | 22 ++++++++++---------- cpp/src/structure/renumber_edgelist_impl.cuh | 8 +++++-- 5 files changed, 42 insertions(+), 13 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index e679b1eafd6..22629c63458 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -51,6 +51,9 @@ struct graph_meta_t> { // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered std::optional> segment_offsets{std::nullopt}; + + vertex_t num_local_unique_edge_rows{}; + vertex_t num_local_unique_edge_cols{}; }; // single-GPU version @@ -136,6 +139,20 @@ class graph_tget_graph_properties(), partition_, adj_matrix_partition_segment_offsets_, + local_sorted_unique_edge_rows_ + ? std::optional{(*local_sorted_unique_edge_rows_).data()} + : std::nullopt, + local_sorted_unique_edge_rows_ + ? std::optional{(*local_sorted_unique_edge_rows_).data() + + (*local_sorted_unique_edge_rows_).size()} + : std::nullopt, + local_sorted_unique_edge_cols_ + ? std::optional{(*local_sorted_unique_edge_cols_).data()} + : std::nullopt, + local_sorted_unique_edge_cols_ + ? 
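// Each optional vector of locally used edge rows/cols is unpacked into an optional
// begin/end pointer pair for the view. The shape of that unpacking, reduced
// (span_view and to_view are hypothetical helpers, not part of this patch):
//
//   #include <cstdint>
//   #include <optional>
//   #include <vector>
//
//   struct span_view { std::int32_t const* first; std::int32_t const* last; };
//
//   std::optional<span_view> to_view(std::optional<std::vector<std::int32_t>> const& v)
//   {
//     return v ? std::make_optional(span_view{v->data(), v->data() + v->size()})
//              : std::nullopt;
//   }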
std::optional{(*local_sorted_unique_edge_cols_).data() + + (*local_sorted_unique_edge_cols_).size()} + : std::nullopt, }, false); } diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 7277008cc63..83cfc6d831b 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -37,6 +37,9 @@ struct renumber_meta_t> edge_t number_of_edges{}; partition_t partition{}; std::vector segment_offsets{}; + + vertex_t num_local_unique_edge_majors{}; + vertex_t num_local_unique_edge_minors{}; }; template diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index a27aa39356e..804b765ae4b 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -305,6 +305,11 @@ struct graph_view_meta_t> adj_matrix_partition_segment_offsets{}; + + std::optional local_sorted_unique_edge_row_first_{}; + std::optional local_sorted_unique_edge_row_last_{}; + std::optional local_sorted_unique_edge_col_first_{}; + std::optional local_sorted_unique_edge_col_last_{}; }; // single-GPU version diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 0f0c2cf76ee..3f6cc1a60fd 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -282,7 +282,7 @@ graph_tget_number_of_edges(), "Invalid input argument: the sum of local edge counts does not match with " @@ -314,14 +314,14 @@ graph_t(thrust::distance( minors.begin(), thrust::unique(handle.get_thrust_policy(), minors.begin(), minors.end()))); if constexpr (store_transposed) { - CUGRAPH_EXPECTS(num_local_unique_edge_majors == num_local_unique_edge_cols, + CUGRAPH_EXPECTS(num_local_unique_edge_majors == meta.num_local_unique_edge_cols, "Invalid input argument: num_unique_edge_cols is erroneous."); - CUGRAPH_EXPECTS(num_local_unique_edge_minors == num_local_unique_edge_rows, + CUGRAPH_EXPECTS(num_local_unique_edge_minors == meta.num_local_unique_edge_rows, "Invalid input argument: num_unique_edge_rows is erroneous."); } else { - CUGRAPH_EXPECTS(num_local_unique_edge_majors == num_local_unique_edge_rows, + CUGRAPH_EXPECTS(num_local_unique_edge_majors == meta.num_local_unique_edge_rows, "Invalid input argument: num_unique_edge_rows is erroneous."); - CUGRAPH_EXPECTS(num_local_unique_edge_minors == num_local_unique_edge_cols, + CUGRAPH_EXPECTS(num_local_unique_edge_minors == meta.num_local_unique_edge_cols, "Invalid input argument: num_unique_edge_cols is erroneous."); } } @@ -401,15 +401,15 @@ graph_t(num_local_unique_edge_majors) / static_cast(aggregate_major_size), @@ -424,7 +424,7 @@ graph_t{ - number_of_vertices, number_of_edges, partition, vertex_partition_segment_offsets}); + renumber_meta_t{number_of_vertices, + number_of_edges, + partition, + vertex_partition_segment_offsets, + num_unique_edge_majors, + num_unique_edge_minors}); } template From 7d3a5ed7a8ba6089a92e62c917f130670e2486c3 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 14 Sep 2021 12:44:23 -0400 Subject: [PATCH 36/57] merge major&minor_properties --- .../cugraph/prims/row_col_properties.cuh | 200 +++++------------- cpp/src/community/louvain.cuh | 20 +- .../weakly_connected_components_impl.cuh | 2 +- cpp/src/structure/coarsen_graph_impl.cuh | 2 +- cpp/src/traversal/sssp_impl.cuh | 2 +- 5 files changed, 62 insertions(+), 164 deletions(-) diff --git a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh index 3aa14a2e859..6259a54b4fc 100644 --- 
a/cpp/include/cugraph/prims/row_col_properties.cuh +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -36,50 +36,41 @@ namespace cugraph { namespace detail { template -class major_properties_device_view_t { +class row_col_properties_device_view_t { public: using value_type = typename thrust::iterator_traits::value_type; - major_properties_device_view_t() = default; + row_col_properties_device_view_t() = default; - major_properties_device_view_t(ValueIterator value_first) : value_first_(value_first) {} - - void add_offset(vertex_t offset) { value_first_ += offset; } - - ValueIterator value_data() const { return value_first_; } - - __device__ ValueIterator get_iter(vertex_t offset) const { return value_first_ + offset; } - __device__ value_type get(vertex_t offset) const { return *get_iter(offset); } - - private: - ValueIterator value_first_{}; -}; - -template -class minor_properties_device_view_t { - public: - using value_type = typename thrust::iterator_traits::value_type; - - minor_properties_device_view_t() = default; - - minor_properties_device_view_t(ValueIterator value_first) - : key_first_(thrust::nullopt), key_last_(thrust::nullopt), value_first_(value_first) + row_col_properties_device_view_t(ValueIterator value_first) + : key_first_(thrust::nullopt), key_last_(thrust::nullopt), key_offset_(0), value_first_(value_first) { } - minor_properties_device_view_t(vertex_t const* key_first, - vertex_t const* key_last, - ValueIterator value_first) - : key_first_(key_first), key_last_(key_last), value_first_(value_first) + row_col_properties_device_view_t(vertex_t const* key_first, + vertex_t const* key_last, + ValueIterator value_first) + : key_first_(key_first), key_last_(key_last), key_offset_(0), value_first_(value_first) { } + void add_offset(vertex_t offset) { + if (key_first_) { + *key_offset_ += offset; + } + else { + value_first_ += offset; + } + } + + ValueIterator value_data() const { return value_first_; } + __device__ ValueIterator get_iter(vertex_t offset) const { auto value_offset = offset; if (key_first_) { - auto it = thrust::lower_bound(thrust::seq, *key_first_, *key_last_, offset); - assert((it != *key_last_) && (*it == offset)); + auto it = thrust::lower_bound(thrust::seq, *key_first_, *key_last_, offset + *key_offset_); + assert((it != *key_last_) && (*it == (offset + *key_offset_))); value_offset = static_cast(thrust::distance(*key_first_, it)); } return value_first_ + value_offset; @@ -90,66 +81,31 @@ class minor_properties_device_view_t { private: thrust::optional key_first_{thrust::nullopt}; thrust::optional key_last_{thrust::nullopt}; + thrust::optional key_offset_{0}; ValueIterator value_first_{}; }; template -class major_properties_t { - public: - major_properties_t() : buffer_(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) {} - - major_properties_t(raft::handle_t const& handle, vertex_t buffer_size) - : buffer_(allocate_dataframe_buffer(buffer_size, handle.get_stream())) - { - } - - void fill(T value, rmm::cuda_stream_view stream) - { - thrust::fill(rmm::exec_policy(stream), - value_data(), - value_data() + size_dataframe_buffer(buffer_), - value); - } - - auto value_data() { return get_dataframe_buffer_begin(buffer_); } - - auto device_view() const - { - auto value_first = get_dataframe_buffer_cbegin(buffer_); - return major_properties_device_view_t(value_first); - } - - auto mutable_device_view() - { - auto value_first = get_dataframe_buffer_begin(buffer_); - return major_properties_device_view_t(value_first); - } - - private: - 
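// With keys engaged, get_iter() above resolves a dense vertex offset to its
// compressed value slot by binary-searching the sorted key array (shifted by
// key_offset_ once views are advanced per partition). Host-side analog of that
// lookup (illustrative):
//
//   #include <algorithm>
//   #include <cassert>
//   #include <cstddef>
//   #include <cstdint>
//   #include <vector>
//
//   std::size_t value_slot(std::vector<std::int32_t> const& sorted_keys, std::int32_t offset)
//   {
//     auto it = std::lower_bound(sorted_keys.begin(), sorted_keys.end(), offset);
//     assert((it != sorted_keys.end()) && (*it == offset));  // the key must be present
//     return static_cast<std::size_t>(it - sorted_keys.begin());
//   }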
decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) buffer_; -}; - -template -class minor_properties_t { +class row_col_properties_t { public: - minor_properties_t() + row_col_properties_t() : key_first_(std::nullopt), key_last_(std::nullopt), buffer_(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) { } - minor_properties_t(raft::handle_t const& handle, vertex_t buffer_size) + row_col_properties_t(raft::handle_t const& handle, vertex_t buffer_size) : key_first_(std::nullopt), key_last_(std::nullopt), buffer_(allocate_dataframe_buffer(buffer_size, handle.get_stream())) { } - minor_properties_t(raft::handle_t const& handle, - vertex_t const* key_first, - vertex_t const* key_last) + row_col_properties_t(raft::handle_t const& handle, + vertex_t const* key_first, + vertex_t const* key_last) : key_first_(key_first), key_last_(key_last), buffer_( @@ -165,16 +121,18 @@ class minor_properties_t { value); } + auto key_first() { return key_first_; } + auto key_last() { return key_last_; } auto value_data() { return get_dataframe_buffer_begin(buffer_); } auto device_view() const { auto value_first = get_dataframe_buffer_cbegin(buffer_); if (key_first_) { - return minor_properties_device_view_t( + return row_col_properties_device_view_t( *key_first_, *key_last_, value_first); } else { - return minor_properties_device_view_t(value_first); + return row_col_properties_device_view_t(value_first); } } @@ -182,10 +140,10 @@ class minor_properties_t { { auto value_first = get_dataframe_buffer_begin(buffer_); if (key_first_) { - return minor_properties_device_view_t( + return row_col_properties_device_view_t( *key_first_, *key_last_, value_first); } else { - return minor_properties_device_view_t(value_first); + return row_col_properties_device_view_t(value_first); } } @@ -214,13 +172,8 @@ auto to_thrust_tuple(Iterator iter) } // namespace detail -template -class row_properties_t; - template -class row_properties_t> { +class row_properties_t { public: using value_type = T; @@ -233,43 +186,18 @@ class row_properties_t( + properties_ = detail::row_col_properties_t( handle, *key_first, *key_last); } else { - properties_ = detail::minor_properties_t( + properties_ = detail::row_col_properties_t( handle, graph_view.get_number_of_local_adj_matrix_partition_rows()); } } void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); } - auto value_data() { return properties_.value_data(); } - - auto device_view() const { return properties_.device_view(); } - auto mutable_device_view() { return properties_.mutable_device_view(); } - - private: - detail::minor_properties_t properties_{}; -}; - -template -class row_properties_t> { - public: - using value_type = T; - - static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - - row_properties_t() = default; - - row_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) - { - properties_ = detail::major_properties_t( - handle, graph_view.get_number_of_local_adj_matrix_partition_rows()); - } - - void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); } + auto key_first() { return properties_.key_first(); } + auto key_last() { return properties_.key_last(); } auto value_data() { return properties_.value_data(); } @@ -277,44 +205,11 @@ class row_properties_t properties_{}; -}; - -template -class col_properties_t; - -template -class col_properties_t> { - public: - using value_type = T; - - static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - - col_properties_t() = 
default; - - col_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) - { - properties_ = detail::major_properties_t( - handle, graph_view.get_number_of_local_adj_matrix_partition_cols()); - } - - void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); } - - auto value_data() { return properties_.value_data(); } - - auto device_view() const { return properties_.device_view(); } - auto mutable_device_view() { return properties_.mutable_device_view(); } - - private: - detail::major_properties_t properties_{}; + detail::row_col_properties_t properties_{}; }; template -class col_properties_t> { +class col_properties_t { public: using value_type = T; @@ -327,23 +222,26 @@ class col_properties_t( + properties_ = detail::row_col_properties_t( handle, *key_first, *key_last); } else { - properties_ = detail::minor_properties_t( + properties_ = detail::row_col_properties_t( handle, graph_view.get_number_of_local_adj_matrix_partition_cols()); } } void fill(T value, rmm::cuda_stream_view stream) { properties_.fill(value, stream); } + auto key_first() { return properties_.key_first(); } + auto key_last() { return properties_.key_last(); } + auto value_data() { return properties_.value_data(); } auto device_view() const { return properties_.device_view(); } auto mutable_device_view() { return properties_.mutable_device_view(); } private: - detail::minor_properties_t properties_{}; + detail::row_col_properties_t properties_{}; }; template @@ -365,11 +263,11 @@ class dummy_properties_t { }; template -auto device_view_concat(detail::major_properties_device_view_t... device_views) +auto device_view_concat(detail::row_col_properties_device_view_t... device_views) { auto concat_first = thrust::make_zip_iterator( thrust_tuple_cat(detail::to_thrust_tuple(device_views.value_data())...)); - return detail::major_properties_device_view_t(concat_first); + return detail::row_col_properties_device_view_t(concat_first); } } // namespace cugraph diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 29153fc2d37..b409136ad85 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -260,11 +260,11 @@ class Louvain { current_graph_view_, graph_view_t::is_multi_gpu ? src_clusters_cache_.device_view() - : detail::major_properties_device_view_t( + : detail::row_col_properties_device_view_t( next_clusters_v_.begin()), graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() - : detail::minor_properties_device_view_t( + : detail::row_col_properties_device_view_t( next_clusters_v_.begin()), [] __device__(auto, auto, weight_t wt, auto src_cluster, auto nbr_cluster) { if (src_cluster == nbr_cluster) { @@ -396,11 +396,11 @@ class Louvain { current_graph_view_, graph_view_t::is_multi_gpu ? src_clusters_cache_.device_view() - : detail::major_properties_device_view_t( + : detail::row_col_properties_device_view_t( next_clusters_v_.data()), graph_view_t::is_multi_gpu ? 
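// device_view_concat() above fuses several property views into one tuple-valued view
// by zipping their value iterators. The core idea in isolation (Thrust, illustrative):
//
//   #include <cstdint>
//   #include <thrust/iterator/zip_iterator.h>
//   #include <thrust/tuple.h>
//
//   int main()
//   {
//     float weights[3]         = {0.5f, 1.0f, 2.0f};
//     std::int32_t clusters[3] = {0, 0, 1};
//     auto zipped = thrust::make_zip_iterator(thrust::make_tuple(weights, clusters));
//     auto wc     = *(zipped + 2);  // tuple holding (2.0f, 1) for vertex 2
//     (void)wc;
//     return 0;
//   }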
dst_clusters_cache_.device_view() - : detail::minor_properties_device_view_t( + : detail::row_col_properties_device_view_t( next_clusters_v_.data()), [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { weight_t sum{0}; @@ -491,13 +491,13 @@ class Louvain { src_cluster_weights.device_view(), src_old_cluster_sum_subtract_pairs.device_view()) : device_view_concat( - detail::major_properties_device_view_t( + detail::row_col_properties_device_view_t( vertex_weights_v_.data()), - detail::major_properties_device_view_t( + detail::row_col_properties_device_view_t( next_clusters_v_.data()), - detail::major_properties_device_view_t( + detail::row_col_properties_device_view_t( vertex_cluster_weights_v.data()), - detail::major_properties_device_view_t( cluster_old_sum_subtract_pair_first)); @@ -507,7 +507,7 @@ class Louvain { zipped_src_device_view, graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() - : detail::minor_properties_device_view_t( + : detail::row_col_properties_device_view_t( next_clusters_v_.data()), cluster_keys_v_.begin(), cluster_keys_v_.end(), @@ -540,7 +540,7 @@ class Louvain { dummy_properties_t{}.device_view(), graph_view_t::is_multi_gpu ? src_clusters_cache_.device_view() - : detail::major_properties_device_view_t( + : detail::row_col_properties_device_view_t( next_clusters_v_.data()), detail::return_edge_weight_t{}, weight_t{0}); diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 0440c6362c7..a3f6152fa95 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -545,7 +545,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, [col_components = GraphViewType::is_multi_gpu ? adj_matrix_col_components.mutable_device_view() - : detail::minor_properties_device_view_t(level_components), + : detail::row_col_properties_device_view_t(level_components), col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), edge_buffer_first = get_dataframe_buffer_begin>(edge_buffer), diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 716c9f67993..af8b32ef708 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -498,7 +498,7 @@ coarsen_graph( matrix_partition_device_view_t( graph_view.get_matrix_partition_view()), labels, - detail::minor_properties_device_view_t(labels), + detail::row_col_properties_device_view_t(labels), graph_view.get_local_adj_matrix_partition_segment_offsets(0)); rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh index ba91d485d65..3a95fdb8fbc 100644 --- a/cpp/src/traversal/sssp_impl.cuh +++ b/cpp/src/traversal/sssp_impl.cuh @@ -167,7 +167,7 @@ void sssp(raft::handle_t const& handle, std::vector{static_cast(Bucket::next_near), static_cast(Bucket::far)}, GraphViewType::is_multi_gpu ? 
adj_matrix_row_distances.device_view() - : detail::major_properties_device_view_t(distances), + : detail::row_col_properties_device_view_t(distances), dummy_properties_t{}.device_view(), [vertex_partition, distances, cutoff] __device__( vertex_t src, vertex_t dst, weight_t w, auto src_val, auto) { From 322490da607a223396ba41a16ab9d282a505707b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 14 Sep 2021 12:45:23 -0400 Subject: [PATCH 37/57] update device_bcast to call bcast taking separate input and output iterators --- cpp/include/cugraph/utilities/device_comm.cuh | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cpp/include/cugraph/utilities/device_comm.cuh b/cpp/include/cugraph/utilities/device_comm.cuh index d7a9b9c4983..9500f114c5a 100644 --- a/cpp/include/cugraph/utilities/device_comm.cuh +++ b/cpp/include/cugraph/utilities/device_comm.cuh @@ -376,11 +376,8 @@ device_bcast_impl(raft::comms::comms_t const& comm, { static_assert(std::is_same::value_type, typename std::iterator_traits::value_type>::value); - if (comm.get_rank() == root) { - comm.bcast(iter_to_raw_ptr(input_first), count, root, stream_view.value()); - } else { - comm.bcast(iter_to_raw_ptr(output_first), count, root, stream_view.value()); - } + comm.bcast( + iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, root, stream_view.value()); } template From 1022012b29ec7efe49d3bf37a105adaa091a6f17 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 14 Sep 2021 13:25:57 -0400 Subject: [PATCH 38/57] compute unique edge row/col offsets in graph constructor --- cpp/include/cugraph/graph.hpp | 4 ++ cpp/include/cugraph/graph_view.hpp | 22 ++++++++-- cpp/src/structure/graph_impl.cuh | 62 +++++++++++++++++++++++++-- cpp/src/structure/graph_view_impl.cuh | 8 +++- 4 files changed, 87 insertions(+), 9 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 22629c63458..f03cc81e9cd 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -146,6 +146,7 @@ class graph_t{(*local_sorted_unique_edge_rows_).data() + (*local_sorted_unique_edge_rows_).size()} : std::nullopt, + local_sorted_unique_edge_row_offsets_, local_sorted_unique_edge_cols_ ? 
std::optional{(*local_sorted_unique_edge_cols_).data()} : std::nullopt, @@ -153,6 +154,7 @@ class graph_t{(*local_sorted_unique_edge_cols_).data() + (*local_sorted_unique_edge_cols_).size()} : std::nullopt, + local_sorted_unique_edge_col_offsets_, }, false); } @@ -177,6 +179,8 @@ class graph_t> local_sorted_unique_edge_rows_{std::nullopt}; std::optional> local_sorted_unique_edge_cols_{std::nullopt}; + std::optional> local_sorted_unique_edge_row_offsets_{std::nullopt}; + std::optional> local_sorted_unique_edge_col_offsets_{std::nullopt}; }; // single-GPU version diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 804b765ae4b..2475cb71995 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -306,10 +306,12 @@ struct graph_view_meta_t> adj_matrix_partition_segment_offsets{}; - std::optional local_sorted_unique_edge_row_first_{}; - std::optional local_sorted_unique_edge_row_last_{}; - std::optional local_sorted_unique_edge_col_first_{}; - std::optional local_sorted_unique_edge_col_last_{}; + std::optional local_sorted_unique_edge_row_first{std::nullopt}; + std::optional local_sorted_unique_edge_row_last{std::nullopt}; + std::optional> local_sorted_unique_edge_row_offsets{std::nullopt}; + std::optional local_sorted_unique_edge_col_first{std::nullopt}; + std::optional local_sorted_unique_edge_col_last{std::nullopt}; + std::optional> local_sorted_unique_edge_col_offsets{std::nullopt}; }; // single-GPU version @@ -597,6 +599,11 @@ class graph_view_t> get_local_sorted_unique_edge_row_offsets() const + { + return local_sorted_unique_edge_row_offsets_; + } + std::optional get_local_sorted_unique_edge_col_begin() const { return local_sorted_unique_edge_col_first_; @@ -607,6 +614,11 @@ class graph_view_t> get_local_sorted_unique_edge_col_offsets() const + { + return local_sorted_unique_edge_col_offsets_; + } + private: std::vector adj_matrix_partition_offsets_{}; std::vector adj_matrix_partition_indices_{}; @@ -627,8 +639,10 @@ class graph_view_t local_sorted_unique_edge_row_first_{std::nullopt}; std::optional local_sorted_unique_edge_row_last_{std::nullopt}; + std::optional> local_sorted_unique_edge_row_offsets_{std::nullopt}; std::optional local_sorted_unique_edge_col_first_{std::nullopt}; std::optional local_sorted_unique_edge_col_last_{std::nullopt}; + std::optional> local_sorted_unique_edge_col_offsets_{std::nullopt}; }; // single-GPU version diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 3f6cc1a60fd..e1371586dd8 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -435,10 +435,37 @@ graph_t{adj_matrix_partition_offsets_[i].data(), major_first})); } assert(cur_size == num_local_unique_edge_majors); + + std::vector h_vertex_partition_firsts(col_comm_size - 1); + for (int i = 1; i < col_comm_size; ++i) { + h_vertex_partition_firsts[i - 1] = + partition_.get_vertex_partition_first(i * row_comm_size + row_comm_rank); + } + rmm::device_uvector d_vertex_partition_firsts(h_vertex_partition_firsts.size(), + handle.get_stream()); + raft::update_device(d_vertex_partition_firsts.data(), + h_vertex_partition_firsts.data(), + h_vertex_partition_firsts.size(), + handle.get_stream()); + rmm::device_uvector d_key_offsets(d_vertex_partition_firsts.size(), + handle.get_stream()); + thrust::lower_bound(handle.get_thrust_policy(), + local_sorted_unique_edge_majors.begin(), + local_sorted_unique_edge_majors.end(), + d_vertex_partition_firsts.begin(), + 
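// The per-partition key offsets computed here are lower_bound positions of each
// vertex partition's first vertex id within the sorted unique-edge list, padded
// with 0 at the front and the list size at the back. Host-side analog (illustrative):
//
//   #include <algorithm>
//   #include <cstddef>
//   #include <cstdint>
//   #include <vector>
//
//   std::vector<std::size_t> compute_key_offsets(
//     std::vector<std::int32_t> const& sorted_keys,
//     std::vector<std::int32_t> const& partition_firsts)  // firsts of partitions 1..P-1
//   {
//     std::vector<std::size_t> offsets{0};
//     for (auto v : partition_firsts) {
//       offsets.push_back(static_cast<std::size_t>(
//         std::lower_bound(sorted_keys.begin(), sorted_keys.end(), v) - sorted_keys.begin()));
//     }
//     offsets.push_back(sorted_keys.size());
//     return offsets;
//   }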
d_vertex_partition_firsts.end(), + d_key_offsets.begin()); + std::vector h_key_offsets(col_comm_size + 1, vertex_t{0}); + h_key_offsets.back() = static_cast(local_sorted_unique_edge_majors.size()); + raft::update_host( + h_key_offsets.data() + 1, d_key_offsets.data(), d_key_offsets.size(), handle.get_stream()); + if constexpr (store_transposed) { - local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_majors); + local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_majors); + local_sorted_unique_edge_col_offsets_ = std::move(h_key_offsets); } else { - local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_majors); + local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_majors); + local_sorted_unique_edge_row_offsets_ = std::move(h_key_offsets); } } @@ -479,10 +506,37 @@ graph_t h_vertex_partition_firsts(row_comm_size - 1); + for (int i = 1; i < row_comm_size; ++i) { + h_vertex_partition_firsts[i - 1] = + partition_.get_vertex_partition_first(col_comm_rank * row_comm_size + i); + } + rmm::device_uvector d_vertex_partition_firsts(h_vertex_partition_firsts.size(), + handle.get_stream()); + raft::update_device(d_vertex_partition_firsts.data(), + h_vertex_partition_firsts.data(), + h_vertex_partition_firsts.size(), + handle.get_stream()); + rmm::device_uvector d_key_offsets(d_vertex_partition_firsts.size(), + handle.get_stream()); + thrust::lower_bound(handle.get_thrust_policy(), + local_sorted_unique_edge_minors.begin(), + local_sorted_unique_edge_minors.end(), + d_vertex_partition_firsts.begin(), + d_vertex_partition_firsts.end(), + d_key_offsets.begin()); + std::vector h_key_offsets(row_comm_size + 1, vertex_t{0}); + h_key_offsets.back() = static_cast(local_sorted_unique_edge_minors.size()); + raft::update_host( + h_key_offsets.data() + 1, d_key_offsets.data(), d_key_offsets.size(), handle.get_stream()); + if constexpr (store_transposed) { - local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_minors); + local_sorted_unique_edge_rows_ = std::move(local_sorted_unique_edge_minors); + local_sorted_unique_edge_row_offsets_ = std::move(h_key_offsets); } else { - local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_minors); + local_sorted_unique_edge_cols_ = std::move(local_sorted_unique_edge_minors); + local_sorted_unique_edge_col_offsets_ = std::move(h_key_offsets); } } diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 368573d4a91..027697630e6 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -179,7 +179,13 @@ graph_view_t Date: Tue, 14 Sep 2021 15:57:22 -0400 Subject: [PATCH 39/57] update copy_to_adj_matrix_row/col to handle (key, value) pairs --- .../prims/copy_to_adj_matrix_row_col.cuh | 233 +++++++++++++----- 1 file changed, 168 insertions(+), 65 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index ab27e7cc3c7..b03e653db27 100644 --- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -30,11 +30,13 @@ #include #include +#include #include #include #include #include +#include #include #include #include @@ -51,7 +53,9 @@ void copy_to_matrix_major(raft::handle_t const& handle, VertexValueInputIterator vertex_value_input_first, MatrixMajorValueOutputWrapper& matrix_major_value_output) { - if (GraphViewType::is_multi_gpu) { + if constexpr 
(GraphViewType::is_multi_gpu) { + using vertex_t = typename GraphViewType::vertex_type; + auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -72,18 +76,53 @@ void copy_to_matrix_major(raft::handle_t const& handle, comm.barrier(); // currently, this is ncclAllReduce #endif - std::vector rx_counts(col_comm_size, size_t{0}); - std::vector displacements(col_comm_size, size_t{0}); - for (int i = 0; i < col_comm_size; ++i) { - rx_counts[i] = graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank); - displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; + if (matrix_major_value_output.key_first()) { + auto key_offsets = GraphViewType::is_adj_matrix_transposed + ? *(graph_view.get_local_sorted_unique_edge_col_offsets()) + : *(graph_view.get_local_sorted_unique_edge_row_offsets()); + + vertex_t max_rx_size{0}; + for (int i = 0; i < col_comm_size; ++i) { + max_rx_size = std::max( + max_rx_size, graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank)); + } + auto rx_value_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(max_rx_size, + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_value_buffer); + for (int i = 0; i < col_comm_size; ++i) { + device_bcast(col_comm, + vertex_value_input_first, + rx_value_first, + graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank), + i, + handle.get_stream()); + + auto v_offset_first = thrust::make_transform_iterator( + *(matrix_major_value_output.key_first()) + key_offsets[i], + [v_first = graph_view.get_vertex_partition_first( + i * row_comm_size + row_comm_rank)] __device__(auto v) { return v - v_first; }); + thrust::gather(handle.get_thrust_policy(), + v_offset_first, + v_offset_first + (key_offsets[i + 1] - key_offsets[i]), + rx_value_first, + matrix_major_value_output.value_data() + key_offsets[i]); + } + } else { + std::vector rx_counts(col_comm_size, size_t{0}); + std::vector displacements(col_comm_size, size_t{0}); + for (int i = 0; i < col_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(i * row_comm_size + row_comm_rank); + displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; + } + device_allgatherv(col_comm, + vertex_value_input_first, + matrix_major_value_output.value_data(), + rx_counts, + displacements, + handle.get_stream()); } - device_allgatherv(col_comm, - vertex_value_input_first, - matrix_major_value_output.value_data(), - rx_counts, - displacements, - handle.get_stream()); // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between // two different communicators (end of col_comm) @@ -96,6 +135,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, comm.barrier(); // currently, this is ncclAllReduce #endif } else { + assert(!(matrix_major_value_output.key_first())); assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? 
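// In the keyed branch above, values broadcast as a dense per-partition array are
// gathered straight into the compressed slots: a transform iterator maps each
// locally present key to its offset in the receive buffer. The Thrust core of that
// gather (fragment; keys, v_first, num_keys, rx_values, and out_values are assumed
// to be in scope, and the __device__ lambda needs --extended-lambda):
//
//   #include <thrust/execution_policy.h>
//   #include <thrust/gather.h>
//   #include <thrust/iterator/transform_iterator.h>
//
//   auto map_first = thrust::make_transform_iterator(
//     keys, [v_first] __device__(int32_t v) { return v - v_first; });
//   thrust::gather(thrust::device,
//                  map_first, map_first + num_keys,
//                  rx_values,    // dense values for this vertex partition
//                  out_values);  // compressed (key, value) slots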
graph_view.get_number_of_local_adj_matrix_partition_cols() : graph_view.get_number_of_local_adj_matrix_partition_rows()); @@ -121,7 +161,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, using edge_t = typename GraphViewType::edge_type; using weight_t = typename GraphViewType::weight_type; - if (GraphViewType::is_multi_gpu) { + if constexpr (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -146,20 +186,26 @@ void copy_to_matrix_major(raft::handle_t const& handle, host_scalar_allgather(col_comm, static_cast(thrust::distance(vertex_first, vertex_last)), handle.get_stream()); + auto max_rx_size = + std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + return std::max(lhs, rhs); + }); + rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(max_rx_size, + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); + + auto key_offsets = GraphViewType::is_adj_matrix_transposed + ? graph_view.get_local_sorted_unique_edge_col_offsets() + : graph_view.get_local_sorted_unique_edge_row_offsets(); for (int i = 0; i < col_comm_size; ++i) { auto matrix_partition = matrix_partition_device_view_t( graph_view.get_matrix_partition_view(i)); - rmm::device_uvector rx_vertices(col_comm_rank == i ? size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_counts[i], - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); - if (col_comm_rank == i) { auto vertex_partition = vertex_partition_device_view_t( @@ -183,19 +229,24 @@ void copy_to_matrix_major(raft::handle_t const& handle, col_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); device_bcast(col_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - if (col_comm_rank == i) { - auto map_first = - thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { - return matrix_partition.get_major_offset_from_major_nocheck(v); - }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter( + if (matrix_major_value_output.key_first()) { + thrust::for_each( handle.get_thrust_policy(), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_major_value_output.value_data() + matrix_partition.get_major_value_start_offset()); + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator((*key_offsets)[i + 1] - (*key_offsets)[i]), + [rx_vertex_first = rx_vertices.begin(), + rx_vertex_last = rx_vertices.end(), + rx_value_first, + output_key_first = *(matrix_major_value_output.key_first()) + (*key_offsets)[i], + output_value_first = + matrix_major_value_output.value_data() + (*key_offsets)[i]] __device__(auto i) { + auto major = *(output_key_first + i); + auto it = thrust::lower_bound(thrust::seq, rx_vertex_first, rx_vertex_last, major); + if (*it == major) { + auto rx_value = *(rx_value_first + thrust::distance(rx_vertex_first, it)); + *(output_value_first + i) = rx_value; + } + }); } else { auto map_first = 
thrust::make_transform_iterator( rx_vertices.begin(), [matrix_partition] __device__(auto v) { @@ -223,6 +274,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, comm.barrier(); // currently, this is ncclAllReduce #endif } else { + assert(!(matrix_major_value_output.key_first())); assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_cols() : graph_view.get_number_of_local_adj_matrix_partition_rows()); @@ -243,7 +295,9 @@ void copy_to_matrix_minor(raft::handle_t const& handle, VertexValueInputIterator vertex_value_input_first, MatrixMinorValueOutputWrapper& matrix_minor_value_output) { - if (GraphViewType::is_multi_gpu) { + if constexpr (GraphViewType::is_multi_gpu) { + using vertex_t = typename GraphViewType::vertex_type; + auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -264,18 +318,53 @@ void copy_to_matrix_minor(raft::handle_t const& handle, comm.barrier(); // currently, this is ncclAllReduce #endif - std::vector rx_counts(row_comm_size, size_t{0}); - std::vector displacements(row_comm_size, size_t{0}); - for (int i = 0; i < row_comm_size; ++i) { - rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); - displacements[i] = (i == 0) ? 0 : displacements[i - 1] + rx_counts[i - 1]; + if (matrix_minor_value_output.key_first()) { + auto key_offsets = GraphViewType::is_adj_matrix_transposed + ? *(graph_view.get_local_sorted_unique_edge_row_offsets()) + : *(graph_view.get_local_sorted_unique_edge_col_offsets()); + + vertex_t max_rx_size{0}; + for (int i = 0; i < row_comm_size; ++i) { + max_rx_size = std::max( + max_rx_size, graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i)); + } + auto rx_value_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(max_rx_size, + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_value_buffer); + for (int i = 0; i < row_comm_size; ++i) { + device_bcast(row_comm, + vertex_value_input_first, + rx_value_first, + graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i), + i, + handle.get_stream()); + + auto v_offset_first = thrust::make_transform_iterator( + *(matrix_minor_value_output.key_first()) + key_offsets[i], + [v_first = graph_view.get_vertex_partition_first( + col_comm_rank * row_comm_size + i)] __device__(auto v) { return v - v_first; }); + thrust::gather(handle.get_thrust_policy(), + v_offset_first, + v_offset_first + (key_offsets[i + 1] - key_offsets[i]), + rx_value_first, + matrix_minor_value_output.value_data() + key_offsets[i]); + } + } else { + std::vector rx_counts(row_comm_size, size_t{0}); + std::vector displacements(row_comm_size, size_t{0}); + for (int i = 0; i < row_comm_size; ++i) { + rx_counts[i] = graph_view.get_vertex_partition_size(col_comm_rank * row_comm_size + i); + displacements[i] = (i == 0) ? 
0 : displacements[i - 1] + rx_counts[i - 1]; + } + device_allgatherv(row_comm, + vertex_value_input_first, + matrix_minor_value_output.value_data(), + rx_counts, + displacements, + handle.get_stream()); } - device_allgatherv(row_comm, - vertex_value_input_first, - matrix_minor_value_output.value_data(), - rx_counts, - displacements, - handle.get_stream()); // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between // two different communicators (end of row_comm) @@ -288,6 +377,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, comm.barrier(); // currently, this is ncclAllReduce #endif } else { + assert(!(matrix_minor_value_output.key_first())); assert(graph_view.get_number_of_local_vertices() == GraphViewType::is_adj_matrix_transposed ? graph_view.get_number_of_local_adj_matrix_partition_rows() : graph_view.get_number_of_local_adj_matrix_partition_cols()); @@ -313,7 +403,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, using edge_t = typename GraphViewType::edge_type; using weight_t = typename GraphViewType::weight_type; - if (GraphViewType::is_multi_gpu) { + if constexpr (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); auto& row_comm = handle.get_subcomm(cugraph::partition_2d::key_naming_t().row_name()); @@ -338,19 +428,25 @@ void copy_to_matrix_minor(raft::handle_t const& handle, host_scalar_allgather(row_comm, static_cast(thrust::distance(vertex_first, vertex_last)), handle.get_stream()); + auto max_rx_size = + std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + return std::max(lhs, rhs); + }); + rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + typename std::iterator_traits::value_type>(max_rx_size, + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin< + typename std::iterator_traits::value_type>(rx_tmp_buffer); + + auto key_offsets = GraphViewType::is_adj_matrix_transposed + ? graph_view.get_local_sorted_unique_edge_row_offsets() + : graph_view.get_local_sorted_unique_edge_col_offsets(); auto matrix_partition = matrix_partition_device_view_t( graph_view.get_matrix_partition_view(size_t{0})); for (int i = 0; i < row_comm_size; ++i) { - rmm::device_uvector rx_vertices(row_comm_rank == i ? 
size_t{0} : rx_counts[i], - handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - typename std::iterator_traits::value_type>(rx_counts[i], - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin< - typename std::iterator_traits::value_type>(rx_tmp_buffer); - if (row_comm_rank == i) { auto vertex_partition = vertex_partition_device_view_t( @@ -374,18 +470,24 @@ void copy_to_matrix_minor(raft::handle_t const& handle, row_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); device_bcast(row_comm, rx_value_first, rx_value_first, rx_counts[i], i, handle.get_stream()); - if (row_comm_rank == i) { - auto map_first = - thrust::make_transform_iterator(vertex_first, [matrix_partition] __device__(auto v) { - return matrix_partition.get_minor_offset_from_minor_nocheck(v); + if (matrix_minor_value_output.key_first()) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator((*key_offsets)[i + 1] - (*key_offsets)[i]), + [rx_vertex_first = rx_vertices.begin(), + rx_vertex_last = rx_vertices.end(), + rx_value_first, + output_key_first = *(matrix_minor_value_output.key_first()) + (*key_offsets)[i], + output_value_first = + matrix_minor_value_output.value_data() + (*key_offsets)[i]] __device__(auto i) { + auto minor = *(output_key_first + i); + auto it = thrust::lower_bound(thrust::seq, rx_vertex_first, rx_vertex_last, minor); + if (*it == minor) { + auto rx_value = *(rx_value_first + thrust::distance(rx_vertex_first, it)); + *(output_value_first + i) = rx_value; + } }); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(handle.get_thrust_policy(), - rx_value_first, - rx_value_first + rx_counts[i], - map_first, - matrix_minor_value_output.value_data()); } else { auto map_first = thrust::make_transform_iterator( rx_vertices.begin(), [matrix_partition] __device__(auto v) { @@ -412,6 +514,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, comm.barrier(); // currently, this is ncclAllReduce #endif } else { + assert(!(matrix_minor_value_output.key_first())); assert(graph_view.get_number_of_local_vertices() == graph_view.get_number_of_local_adj_matrix_partition_rows()); auto val_first = thrust::make_permutation_iterator(vertex_value_input_first, vertex_first); From 540b9734e4005a8f696e884cd6405867e7976f0f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 14 Sep 2021 21:35:22 -0400 Subject: [PATCH 40/57] update code calling device_bcast --- ...ransform_reduce_key_aggregated_out_nbr.cuh | 12 ---------- .../update_frontier_v_push_if_out_nbr.cuh | 19 ++------------- cpp/src/structure/coarsen_graph_impl.cuh | 8 +------ cpp/src/structure/renumber_edgelist_impl.cuh | 24 ++++++++++++------- 4 files changed, 18 insertions(+), 45 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh index 7d68cf8d13d..58b5c19ed05 100644 --- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh +++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh @@ -425,18 +425,6 @@ void copy_v_transform_reduce_key_aggregated_out_nbr( i, handle.get_stream()); } - // FIXME: these copies are unnecessary, better fix RAFT comm's bcast to take separate input & - // output pointers - auto execution_policy = 
handle.get_thrust_policy(); - thrust::copy(execution_policy, - map_unique_key_first, - map_unique_key_last, - map_keys.begin() + map_displacements[row_comm_rank]); - thrust::copy( - execution_policy, - map_value_first, - map_value_first + thrust::distance(map_unique_key_first, map_unique_key_last), - get_dataframe_buffer_begin(map_value_buffer) + map_displacements[row_comm_rank]); handle.get_stream_view().synchronize(); // cuco::static_map currently does not take stream diff --git a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh index 77d0aaab2ec..507d54ecac8 100644 --- a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh +++ b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh @@ -654,18 +654,10 @@ typename GraphViewType::edge_type compute_num_out_nbrs_from_frontier( rmm::device_uvector frontier_vertices(local_frontier_sizes[i], handle.get_stream_view()); - // FIXME: this copy is unnecessary, better fix RAFT comm's bcast to take const iterators for - // input - if (col_comm_rank == static_cast(i)) { - thrust::copy(execution_policy, - local_frontier_vertex_first, - local_frontier_vertex_last, - frontier_vertices.begin()); - } device_bcast(col_comm, + local_frontier_vertex_first, frontier_vertices.data(), - frontier_vertices.data(), - frontier_vertices.size(), + local_frontier_sizes[i], static_cast(i), handle.get_stream()); @@ -893,13 +885,6 @@ void update_frontier_v_push_if_out_nbr( resize_dataframe_buffer( matrix_partition_frontier_key_buffer, matrix_partition_frontier_size, handle.get_stream()); - if (static_cast(col_comm_rank) == i) { - thrust::copy(handle.get_thrust_policy(), - frontier_key_first, - frontier_key_last, - get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer)); - } - device_bcast(col_comm, frontier_key_first, get_dataframe_buffer_begin(matrix_partition_frontier_key_buffer), diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index af8b32ef708..1fe22f4c902 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -269,14 +269,8 @@ coarsen_graph( store_transposed ? 
graph_view.get_number_of_local_adj_matrix_partition_cols(i) : graph_view.get_number_of_local_adj_matrix_partition_rows(i), handle.get_stream()); - if (col_comm_rank == static_cast(i)) { - // FIXME: this copy is unnecessary, beter fix RAFT comm's bcast to take const iterators for - // input - thrust::copy( - handle.get_thrust_policy(), labels, labels + major_labels.size(), major_labels.begin()); - } device_bcast(col_comm, - major_labels.data(), + labels, major_labels.data(), major_labels.size(), static_cast(i), diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 56aca7725ea..75b04bf8b6b 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -701,11 +701,14 @@ renumber_edgelist( comm.barrier(); // currently, this is ncclAllReduce #endif + vertex_t max_matrix_partition_major_size{0}; + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + max_matrix_partition_major_size = + std::max(max_matrix_partition_major_size, partition.get_matrix_partition_major_size(i)); + } + rmm::device_uvector renumber_map_major_labels(max_matrix_partition_major_size, + handle.get_stream()); for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { - rmm::device_uvector renumber_map_major_labels( - col_comm_rank == static_cast(i) ? vertex_t{0} - : partition.get_matrix_partition_major_size(i), - handle.get_stream()); device_bcast(col_comm, renumber_map_labels.data(), renumber_map_major_labels.data(), @@ -728,8 +731,7 @@ renumber_edgelist( invalid_vertex_id::value, stream_adapter}; auto pair_first = thrust::make_zip_iterator(thrust::make_tuple( - col_comm_rank == static_cast(i) ? renumber_map_labels.begin() - : renumber_map_major_labels.begin(), + renumber_map_major_labels.begin(), thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))); renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i)); renumber_map.find(edgelist_major_vertices[i], @@ -750,10 +752,14 @@ renumber_edgelist( if ((partition.get_matrix_partition_minor_size() >= number_of_edges / comm_size) && edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part + vertex_t max_segment_size{0}; + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + max_segment_size = std::max( + max_segment_size, partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i)); + } + rmm::device_uvector renumber_map_minor_labels(max_segment_size, handle.get_stream()); for (int i = 0; i < row_comm_size; ++i) { auto segment_size = partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i); - rmm::device_uvector renumber_map_minor_labels( - row_comm_rank == i ? vertex_t{0} : segment_size, handle.get_stream()); device_bcast(row_comm, renumber_map_labels.data(), renumber_map_minor_labels.data(), @@ -776,7 +782,7 @@ renumber_edgelist( invalid_vertex_id::value, stream_adapter}; auto pair_first = thrust::make_zip_iterator(thrust::make_tuple( - row_comm_rank == i ? 
renumber_map_labels.begin() : renumber_map_minor_labels.begin(), + renumber_map_minor_labels.begin(), thrust::make_counting_iterator( partition.get_vertex_partition_first(col_comm_rank * row_comm_size + i)))); renumber_map.insert(pair_first, pair_first + segment_size); From 947ef7beaf4f9c65b89e8bf7568b7dd196980702 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 15 Sep 2021 10:01:58 -0400 Subject: [PATCH 41/57] bug fix --- .../structure/create_graph_from_edgelist_impl.hpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.hpp b/cpp/src/structure/create_graph_from_edgelist_impl.hpp index 69e16266bcd..e9f670ef4a4 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.hpp +++ b/cpp/src/structure/create_graph_from_edgelist_impl.hpp @@ -131,11 +131,14 @@ create_graph_from_edgelist_impl(raft::handle_t const& handle, cugraph::graph_t( handle, edgelists, - cugraph::graph_meta_t{meta.number_of_vertices, - meta.number_of_edges, - graph_properties, - meta.partition, - meta.segment_offsets}), + cugraph::graph_meta_t{ + meta.number_of_vertices, + meta.number_of_edges, + graph_properties, + meta.partition, + meta.segment_offsets, + store_transposed ? meta.num_local_unique_edge_minors : meta.num_local_unique_edge_majors, + store_transposed ? meta.num_local_unique_edge_majors : meta.num_local_unique_edge_minors}), std::optional>{std::move(renumber_map_labels)}); } From e2d5e5833a31dd2eb4ef40ea1e11099959c0dfd4 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 15 Sep 2021 14:11:47 -0400 Subject: [PATCH 42/57] bug fix --- cpp/src/structure/coarsen_graph_impl.cuh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 1fe22f4c902..3d9e99a6fea 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -460,7 +460,9 @@ coarsen_graph( meta.number_of_edges, graph_properties_t{graph_view.is_symmetric(), false}, meta.partition, - meta.segment_offsets}), + meta.segment_offsets, + store_transposed ? meta.num_local_unique_edge_minors : meta.num_local_unique_edge_majors, + store_transposed ? 
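meta.num_local_unique_edge_majors : meta.num_local_unique_edge_minors}),
    std::move(renumber_map_labels));
}

From 593a3102e7f9900ae58d315ef7809dea491f343d Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 15 Sep 2021 14:36:16 -0400
Subject: [PATCH 43/57] update host_scalar_(all)reduce to take reduction op

host_scalar_allreduce & host_scalar_reduce previously hard-coded
raft::comms::op_t::SUM; the reduction op is now an explicit parameter, and
every call site passes it. A minimal calling sketch under the updated
signature (local_count and local_ratio are illustrative placeholders, not
names from this patch; assumes a raft::handle_t with comms initialized):

    // sum a per-rank count across all GPUs (the old implicit behavior)
    auto global_count = cugraph::host_scalar_allreduce(
      handle.get_comms(), local_count, raft::comms::op_t::SUM, handle.get_stream());

    // other reductions become possible, e.g. a global maximum
    auto global_max = cugraph::host_scalar_allreduce(
      handle.get_comms(), local_ratio, raft::comms::op_t::MAX, handle.get_stream());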
---
 cpp/include/cugraph/prims/count_if_v.cuh      |  4 +--
 cpp/include/cugraph/prims/reduce_v.cuh        |  6 ++--
 .../cugraph/prims/transform_reduce_e.cuh      |  3 +-
 .../cugraph/prims/transform_reduce_v.cuh      |  6 ++--
 cpp/include/cugraph/prims/vertex_frontier.cuh |  6 ++--
 .../cugraph/utilities/host_scalar_comm.cuh    | 28 +++++++++++--------
 cpp/src/community/louvain.cuh                 |  4 +--
 .../weakly_connected_components_impl.cuh      |  2 +-
 cpp/src/link_analysis/pagerank_impl.cuh       | 13 +++++----
 cpp/src/structure/graph_impl.cuh              |  4 +--
 cpp/src/structure/graph_view_impl.cuh         |  6 ++--
 cpp/src/structure/renumber_edgelist_impl.cuh  |  1 +
 cpp/src/traversal/bfs_impl.cuh                |  3 +-
 cpp/src/utilities/cython.cu                   |  6 ++--
 14 files changed, 54 insertions(+), 38 deletions(-)

diff --git a/cpp/include/cugraph/prims/count_if_v.cuh b/cpp/include/cugraph/prims/count_if_v.cuh
index b2d4283d859..05f778c9a0c 100644
--- a/cpp/include/cugraph/prims/count_if_v.cuh
+++ b/cpp/include/cugraph/prims/count_if_v.cuh
@@ -59,7 +59,7 @@ typename GraphViewType::vertex_type count_if_v(raft::handle_t const& handle,
                               vertex_value_input_first + graph_view.get_number_of_local_vertices(),
                               v_op);
   if (GraphViewType::is_multi_gpu) {
-    count = host_scalar_allreduce(handle.get_comms(), count, handle.get_stream());
+    count = host_scalar_allreduce(handle.get_comms(), count, raft::comms::op_t::SUM, handle.get_stream());
   }
   return count;
 }
@@ -94,7 +94,7 @@ typename GraphViewType::vertex_type count_if_v(raft::handle_t const& handle,
 {
   auto count = thrust::count_if(handle.get_thrust_policy(), input_first, input_last, v_op);
   if (GraphViewType::is_multi_gpu) {
-    count = host_scalar_allreduce(handle.get_comms(), count, handle.get_stream());
+    count = host_scalar_allreduce(handle.get_comms(), count, raft::comms::op_t::SUM, handle.get_stream());
   }
   return count;
 }
diff --git a/cpp/include/cugraph/prims/reduce_v.cuh b/cpp/include/cugraph/prims/reduce_v.cuh
index f41774675fb..998d9cdb917 100644
--- a/cpp/include/cugraph/prims/reduce_v.cuh
+++ b/cpp/include/cugraph/prims/reduce_v.cuh
@@ -58,7 +58,8 @@ T reduce_v(raft::handle_t const& handle,
     ((GraphViewType::is_multi_gpu) && (handle.get_comms().get_rank() == 0)) ? init : T{},
     property_add());
   if (GraphViewType::is_multi_gpu) {
-    ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream());
+    ret =
+      host_scalar_allreduce(handle.get_comms(), ret, raft::comms::op_t::SUM, handle.get_stream());
   }
   return ret;
 }
@@ -95,7 +96,8 @@ T reduce_v(raft::handle_t const& handle,
     ((GraphViewType::is_multi_gpu) && (handle.get_comms().get_rank() == 0)) ?
init : T{}, property_add()); if (GraphViewType::is_multi_gpu) { - ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); + ret = + host_scalar_allreduce(handle.get_comms(), ret, raft::comms::op_t::SUM, handle.get_stream()); } return ret; } diff --git a/cpp/include/cugraph/prims/transform_reduce_e.cuh b/cpp/include/cugraph/prims/transform_reduce_e.cuh index f15880b4ec0..990730dee32 100644 --- a/cpp/include/cugraph/prims/transform_reduce_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_e.cuh @@ -517,7 +517,8 @@ T transform_reduce_e(raft::handle_t const& handle, edge_property_add); if (GraphViewType::is_multi_gpu) { - result = host_scalar_allreduce(handle.get_comms(), result, handle.get_stream()); + result = host_scalar_allreduce( + handle.get_comms(), result, raft::comms::op_t::SUM, handle.get_stream()); } return result; diff --git a/cpp/include/cugraph/prims/transform_reduce_v.cuh b/cpp/include/cugraph/prims/transform_reduce_v.cuh index 118db15b38a..812283da838 100644 --- a/cpp/include/cugraph/prims/transform_reduce_v.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_v.cuh @@ -63,7 +63,8 @@ T transform_reduce_v(raft::handle_t const& handle, ((GraphViewType::is_multi_gpu) && (handle.get_comms().get_rank() == 0)) ? init : T{}, property_add()); if (GraphViewType::is_multi_gpu) { - ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); + ret = + host_scalar_allreduce(handle.get_comms(), ret, raft::comms::op_t::SUM, handle.get_stream()); } return ret; } @@ -106,7 +107,8 @@ T transform_reduce_v(raft::handle_t const& handle, ((GraphViewType::is_multi_gpu) && (handle.get_comms().get_rank() == 0)) ? init : T{}, property_add()); if (GraphViewType::is_multi_gpu) { - ret = host_scalar_allreduce(handle.get_comms(), ret, handle.get_stream()); + ret = + host_scalar_allreduce(handle.get_comms(), ret, raft::comms::op_t::SUM, handle.get_stream()); } return ret; } diff --git a/cpp/include/cugraph/prims/vertex_frontier.cuh b/cpp/include/cugraph/prims/vertex_frontier.cuh index 5f5a3225bdc..82e0f4ab880 100644 --- a/cpp/include/cugraph/prims/vertex_frontier.cuh +++ b/cpp/include/cugraph/prims/vertex_frontier.cuh @@ -194,8 +194,10 @@ class SortedUniqueKeyBucket { template std::enable_if_t aggregate_size() const { - return host_scalar_allreduce( - handle_ptr_->get_comms(), vertices_.size(), handle_ptr_->get_stream()); + return host_scalar_allreduce(handle_ptr_->get_comms(), + vertices_.size(), + raft::comms::op_t::SUM, + handle_ptr_->get_stream()); } template diff --git a/cpp/include/cugraph/utilities/host_scalar_comm.cuh b/cpp/include/cugraph/utilities/host_scalar_comm.cuh index 5675d57e8d3..bc056548dac 100644 --- a/cpp/include/cugraph/utilities/host_scalar_comm.cuh +++ b/cpp/include/cugraph/utilities/host_scalar_comm.cuh @@ -68,14 +68,15 @@ template struct host_allreduce_tuple_scalar_element_impl { void run(raft::comms::comms_t const& comm, rmm::device_uvector& tuple_scalar_elements, + raft::comms::op_t op, cudaStream_t stream) const { using element_t = typename thrust::tuple_element::type; static_assert(sizeof(element_t) <= sizeof(int64_t)); auto ptr = reinterpret_cast(tuple_scalar_elements.data() + I); - comm.allreduce(ptr, ptr, 1, raft::comms::op_t::SUM, stream); + comm.allreduce(ptr, ptr, 1, op, stream); host_allreduce_tuple_scalar_element_impl().run( - comm, tuple_scalar_elements, stream); + comm, tuple_scalar_elements, op, stream); } }; @@ -83,6 +84,7 @@ template struct host_allreduce_tuple_scalar_element_impl { void run(raft::comms::comms_t const& comm, 
rmm::device_uvector& tuple_scalar_elements, + raft::comms::op_t op, cudaStream_t stream) const { } @@ -92,15 +94,16 @@ template struct host_reduce_tuple_scalar_element_impl { void run(raft::comms::comms_t const& comm, rmm::device_uvector& tuple_scalar_elements, + raft::comms::op_t op, int root, cudaStream_t stream) const { using element_t = typename thrust::tuple_element::type; static_assert(sizeof(element_t) <= sizeof(int64_t)); auto ptr = reinterpret_cast(tuple_scalar_elements.data() + I); - comm.reduce(ptr, ptr, 1, raft::comms::op_t::SUM, root, stream); + comm.reduce(ptr, ptr, 1, op, root, stream); host_reduce_tuple_scalar_element_impl().run( - comm, tuple_scalar_elements, root, stream); + comm, tuple_scalar_elements, op, root, stream); } }; @@ -108,6 +111,7 @@ template struct host_reduce_tuple_scalar_element_impl { void run(raft::comms::comms_t const& comm, rmm::device_uvector& tuple_scalar_elements, + raft::comms::op_t op, int root, cudaStream_t stream) const { @@ -118,11 +122,11 @@ struct host_reduce_tuple_scalar_element_impl { template std::enable_if_t::value, T> host_scalar_allreduce( - raft::comms::comms_t const& comm, T input, cudaStream_t stream) + raft::comms::comms_t const& comm, T input, raft::comms::op_t op, cudaStream_t stream) { rmm::device_uvector d_input(1, stream); raft::update_device(d_input.data(), &input, 1, stream); - comm.allreduce(d_input.data(), d_input.data(), 1, raft::comms::op_t::SUM, stream); + comm.allreduce(d_input.data(), d_input.data(), 1, op, stream); T h_input{}; raft::update_host(&h_input, d_input.data(), 1, stream); auto status = comm.sync_stream(stream); @@ -132,7 +136,7 @@ std::enable_if_t::value, T> host_scalar_allreduce( template std::enable_if_t::value, T> host_scalar_allreduce( - raft::comms::comms_t const& comm, T input, cudaStream_t stream) + raft::comms::comms_t const& comm, T input, raft::comms::op_t op, cudaStream_t stream) { size_t constexpr tuple_size = thrust::tuple_size::value; std::vector h_tuple_scalar_elements(tuple_size); @@ -144,7 +148,7 @@ std::enable_if_t::value, T> host_scala raft::update_device( d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream); detail::host_allreduce_tuple_scalar_element_impl().run( - comm, d_tuple_scalar_elements, stream); + comm, d_tuple_scalar_elements, op, stream); raft::update_host( h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream); auto status = comm.sync_stream(stream); @@ -158,11 +162,11 @@ std::enable_if_t::value, T> host_scala // Return value is valid only in root (return value may better be std::optional in C++17 or later) template std::enable_if_t::value, T> host_scalar_reduce( - raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream) + raft::comms::comms_t const& comm, T input, raft::comms::op_t op, int root, cudaStream_t stream) { rmm::device_uvector d_input(1, stream); raft::update_device(d_input.data(), &input, 1, stream); - comm.reduce(d_input.data(), d_input.data(), 1, raft::comms::op_t::SUM, stream); + comm.reduce(d_input.data(), d_input.data(), 1, op, stream); T h_input{}; if (comm.get_rank() == root) { raft::update_host(&h_input, d_input.data(), 1, stream); } auto status = comm.sync_stream(stream); @@ -173,7 +177,7 @@ std::enable_if_t::value, T> host_scalar_reduce( // Return value is valid only in root (return value may better be std::optional in C++17 or later) template std::enable_if_t::value, T> host_scalar_reduce( - raft::comms::comms_t const& comm, T input, int root, cudaStream_t stream) + 
raft::comms::comms_t const& comm, T input, raft::comms::op_t op, int root, cudaStream_t stream) { size_t constexpr tuple_size = thrust::tuple_size::value; std::vector h_tuple_scalar_elements(tuple_size); @@ -185,7 +189,7 @@ std::enable_if_t::value, T> host_scala raft::update_device( d_tuple_scalar_elements.data(), h_tuple_scalar_elements.data(), tuple_size, stream); detail::host_reduce_tuple_scalar_element_impl().run( - comm, d_tuple_scalar_elements, root, stream); + comm, d_tuple_scalar_elements, op, root, stream); if (comm.get_rank() == root) { raft::update_host( h_tuple_scalar_elements.data(), d_tuple_scalar_elements.data(), tuple_size, stream); diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 613f8da3206..23adca97342 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -251,8 +251,8 @@ class Louvain { thrust::plus()); if (graph_view_t::is_multi_gpu) { - sum_degree_squared = - host_scalar_allreduce(handle_.get_comms(), sum_degree_squared, handle_.get_stream()); + sum_degree_squared = host_scalar_allreduce( + handle_.get_comms(), sum_degree_squared, raft::comms::op_t::SUM, handle_.get_stream()); } weight_t sum_internal = transform_reduce_e( diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index c7f214c6314..3fb9505fa54 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -656,7 +656,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, auto aggregate_num_inserts = num_inserts; if (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); - aggregate_num_inserts = host_scalar_allreduce(comm, num_inserts, handle.get_stream()); + aggregate_num_inserts = host_scalar_allreduce(comm, num_inserts, raft::comms::op_t::SUM, handle.get_stream()); } if (aggregate_num_inserts > 0) { diff --git a/cpp/src/link_analysis/pagerank_impl.cuh b/cpp/src/link_analysis/pagerank_impl.cuh index c3ccf4f4763..b6023d21bf2 100644 --- a/cpp/src/link_analysis/pagerank_impl.cuh +++ b/cpp/src/link_analysis/pagerank_impl.cuh @@ -69,12 +69,13 @@ void pagerank( if (num_vertices == 0) { return; } auto aggregate_personalization_vector_size = - personalization_vertices - ? GraphViewType::is_multi_gpu - ? host_scalar_allreduce( - handle.get_comms(), *personalization_vector_size, handle.get_stream()) - : *personalization_vector_size - : vertex_t{0}; + personalization_vertices ? GraphViewType::is_multi_gpu + ? host_scalar_allreduce(handle.get_comms(), + *personalization_vector_size, + raft::comms::op_t::SUM, + handle.get_stream()) + : *personalization_vector_size + : vertex_t{0}; // 1. 
check input arguments diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index e54d69204ac..23bf80adca0 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -268,8 +268,8 @@ graph_tget_number_of_edges(), "Invalid input argument: the sum of local edge counts does not match with " "meta.number_of_edges."); diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 368573d4a91..572fa0598fc 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -267,8 +267,10 @@ graph_view_t{minor_first, minor_last}) == 0, "Internal Error: adj_matrix_partition_indices[] have out-of-range vertex IDs."); } - number_of_local_edges_sum = host_scalar_allreduce( - this->get_handle_ptr()->get_comms(), number_of_local_edges_sum, default_stream_view.value()); + number_of_local_edges_sum = host_scalar_allreduce(this->get_handle_ptr()->get_comms(), + number_of_local_edges_sum, + raft::comms::op_t::SUM, + default_stream_view.value()); CUGRAPH_EXPECTS(number_of_local_edges_sum == this->get_number_of_edges(), "Internal Error: the sum of local edges counts does not match with " "number_of_local_edges."); diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 8104db8eebc..b6a0b3c98b2 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -669,6 +669,7 @@ renumber_edgelist( auto number_of_edges = host_scalar_allreduce( comm, std::accumulate(edgelist_edge_counts.begin(), edgelist_edge_counts.end(), edge_t{0}), + raft::comms::op_t::SUM, handle.get_stream()); // 3. renumber edges diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index ed528bd3d34..c1b8260b0a3 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -71,7 +71,8 @@ void bfs(raft::handle_t const& handle, auto aggregate_n_sources = GraphViewType::is_multi_gpu - ? host_scalar_allreduce(handle.get_comms(), n_sources, handle.get_stream()) + ? 
host_scalar_allreduce( + handle.get_comms(), n_sources, raft::comms::op_t::SUM, handle.get_stream()) : n_sources; CUGRAPH_EXPECTS(aggregate_n_sources > 0, "Invalid input argument: input should have at least one source"); diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index bee78fb9952..2fe4bd3fe60 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -580,9 +580,9 @@ void call_pagerank(raft::handle_t const& handle, bool has_guess) { if (graph_container.is_multi_gpu) { - auto& comm = handle.get_comms(); - auto aggregate_personalization_subset_size = - cugraph::host_scalar_allreduce(comm, personalization_subset_size, handle.get_stream()); + auto& comm = handle.get_comms(); + auto aggregate_personalization_subset_size = cugraph::host_scalar_allreduce( + comm, personalization_subset_size, raft::comms::op_t::SUM, handle.get_stream()); if (graph_container.edgeType == numberTypeEnum::int32Type) { auto graph = From 11a3053630f39c83878e168057e34bd6507b8fea Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 15 Sep 2021 14:40:54 -0400 Subject: [PATCH 44/57] clang-format --- cpp/include/cugraph/prims/count_if_v.cuh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/include/cugraph/prims/count_if_v.cuh b/cpp/include/cugraph/prims/count_if_v.cuh index 05f778c9a0c..f56da008eca 100644 --- a/cpp/include/cugraph/prims/count_if_v.cuh +++ b/cpp/include/cugraph/prims/count_if_v.cuh @@ -59,7 +59,8 @@ typename GraphViewType::vertex_type count_if_v(raft::handle_t const& handle, vertex_value_input_first + graph_view.get_number_of_local_vertices(), v_op); if (GraphViewType::is_multi_gpu) { - count = host_scalar_allreduce(handle.get_comms(), count, raft::comms::op_t::SUM, handle.get_stream()); + count = + host_scalar_allreduce(handle.get_comms(), count, raft::comms::op_t::SUM, handle.get_stream()); } return count; } @@ -94,7 +95,8 @@ typename GraphViewType::vertex_type count_if_v(raft::handle_t const& handle, { auto count = thrust::count_if(handle.get_thrust_policy(), input_first, input_last, v_op); if (GraphViewType::is_multi_gpu) { - count = host_scalar_allreduce(handle.get_comms(), count, raft::comms::op_t::SUM, handle.get_stream()); + count = + host_scalar_allreduce(handle.get_comms(), count, raft::comms::op_t::SUM, handle.get_stream()); } return count; } From 3c5bb2d593b86db5c5a35c3620067e5c75d8acc2 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 15 Sep 2021 14:45:18 -0400 Subject: [PATCH 45/57] additional clang-format --- cpp/src/components/weakly_connected_components_impl.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 3fb9505fa54..66c9447605d 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -655,8 +655,9 @@ void weakly_connected_components_impl(raft::handle_t const& handle, auto num_inserts = num_edge_inserts.value(handle.get_stream_view()); auto aggregate_num_inserts = num_inserts; if (GraphViewType::is_multi_gpu) { - auto& comm = handle.get_comms(); - aggregate_num_inserts = host_scalar_allreduce(comm, num_inserts, raft::comms::op_t::SUM, handle.get_stream()); + auto& comm = handle.get_comms(); + aggregate_num_inserts = + host_scalar_allreduce(comm, num_inserts, raft::comms::op_t::SUM, handle.get_stream()); } if (aggregate_num_inserts > 0) { From a85181822dc627f3ebadbcb8ea2416382f2bbf9f Mon Sep 
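17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 15 Sep 2021 14:52:48 -0400
Subject: [PATCH 46/57] bug fix (max reduction instead of erroneous sum reduction)

The fill-ratio check decides whether row/col properties are stored as
(key, value) pairs; it is meant to compare the largest per-GPU fill ratio
against detail::row_col_properties_kv_pair_fill_ratio_threshold. Summing the
per-rank ratios inflates the value with the number of ranks, so the reduction
op is switched to MAX. A minimal sketch of the fixed pattern (fill_ratio is
an illustrative local value, not a name from this patch):

    // every rank contributes its own ratio; keep the global maximum, not the sum
    auto max_fill_ratio = cugraph::host_scalar_allreduce(
      comm, fill_ratio, raft::comms::op_t::MAX, handle.get_stream());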
---
 cpp/src/structure/graph_impl.cuh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh
index e1371586dd8..68c517dc599 100644
--- a/cpp/src/structure/graph_impl.cuh
+++ b/cpp/src/structure/graph_impl.cuh
@@ -413,10 +413,12 @@ graph_t
   auto max_major_properties_fill_ratio = host_scalar_allreduce(
     comm,
     static_cast(num_local_unique_edge_majors) / static_cast(aggregate_major_size),
+    raft::comms::op_t::MAX,
     handle.get_stream());
   auto max_minor_properties_fill_ratio = host_scalar_allreduce(
     comm,
     static_cast(num_local_unique_edge_minors) / static_cast(minor_size),
+    raft::comms::op_t::MAX,
     handle.get_stream());
   if (max_major_properties_fill_ratio < detail::row_col_properties_kv_pair_fill_ratio_threshold) {

From 74ee5c4a7db6e3aed039e4c05dd868d6300094da Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Thu, 16 Sep 2021 08:33:54 -0400
Subject: [PATCH 47/57] bug fix
---
 cpp/src/structure/graph_impl.cuh | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh
index 5789aefd0d7..206594c8348 100644
--- a/cpp/src/structure/graph_impl.cuh
+++ b/cpp/src/structure/graph_impl.cuh
@@ -264,7 +264,7 @@ graph_t
Date: Thu, 16 Sep 2021 08:35:36 -0400
Subject: [PATCH 48/57] update row/col properties classes
---
 .../copy_v_transform_reduce_in_out_nbr.cuh    |   4 +-
 ...ransform_reduce_key_aggregated_out_nbr.cuh |   2 +-
 .../cugraph/prims/row_col_properties.cuh      | 326 +++++++++++++++---
 ...orm_reduce_by_adj_matrix_row_col_key_e.cuh |   9 +-
 .../cugraph/prims/transform_reduce_e.cuh      |   4 +-
 .../update_frontier_v_push_if_out_nbr.cuh     |   2 +-
 cpp/src/community/louvain.cuh                 |  20 +-
 .../weakly_connected_components_impl.cuh      |   2 +-
 cpp/src/structure/coarsen_graph_impl.cuh      |   2 +-
 cpp/src/traversal/sssp_impl.cuh               |   2 +-
 10 files changed, 300 insertions(+), 73 deletions(-)

diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
index e208c13c412..6900c2a0d07 100644
--- a/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
+++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_in_out_nbr.cuh
@@ -479,9 +479,9 @@ void copy_v_transform_reduce_nbr(raft::handle_t const& handle,
     auto matrix_partition_row_value_input = adj_matrix_row_value_input;
     auto matrix_partition_col_value_input = adj_matrix_col_value_input;
     if constexpr (GraphViewType::is_adj_matrix_transposed) {
-      matrix_partition_col_value_input.add_offset(matrix_partition.get_major_value_start_offset());
+      matrix_partition_col_value_input.set_local_adj_matrix_partition_idx(i);
     } else {
-      matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset());
+      matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i);
     }
 
     std::conditional_t<
diff --git a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
index f21ffebde53..a11e757b797 100644
--- a/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
+++ b/cpp/include/cugraph/prims/copy_v_transform_reduce_key_aggregated_out_nbr.cuh
@@ -624,7 +624,7 @@ void copy_v_transform_reduce_key_aggregated_out_nbr(
     auto tmp_e_op_result_buffer_first = get_dataframe_buffer_begin(tmp_e_op_result_buffer);
 
     auto matrix_partition_row_value_input =
adj_matrix_row_value_input; - matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); auto triplet_first = thrust::make_zip_iterator(thrust::make_tuple( tmp_major_vertices.begin(), tmp_minor_keys.begin(), tmp_key_aggregated_edge_weights.begin())); diff --git a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh index fbece29ceb0..ca6f16954ee 100644 --- a/cpp/include/cugraph/prims/row_col_properties.cuh +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -36,31 +36,116 @@ namespace cugraph { namespace detail { template -class row_col_properties_device_view_t { +class major_properties_device_view_t { public: using value_type = typename thrust::iterator_traits::value_type; - row_col_properties_device_view_t() = default; + major_properties_device_view_t() = default; - row_col_properties_device_view_t(ValueIterator value_first) - : key_first_(thrust::nullopt), key_last_(thrust::nullopt), key_offset_(0), value_first_(value_first) + major_properties_device_view_t( + ValueIterator value_first) // for single-GPU only and for advanced users + : value_first_(value_first) { } - row_col_properties_device_view_t(vertex_t const* key_first, - vertex_t const* key_last, - ValueIterator value_first) - : key_first_(key_first), key_last_(key_last), key_offset_(0), value_first_(value_first) + major_properties_device_view_t(ValueIterator value_first, + vertex_t const* matrix_partition_major_value_start_offsets) + : value_first_(value_first), + matrix_partition_major_value_start_offsets_(matrix_partition_major_value_start_offsets) { + set_local_adj_matrix_partition_idx(size_t{0}); } - void add_offset(vertex_t offset) { + major_properties_device_view_t(vertex_t const* key_first, + ValueIterator value_first, + vertex_t const* matrix_partition_key_offsets, + vertex_t const* matrix_partition_major_firsts) + : key_first_(key_first), + value_first_(value_first), + matrix_partition_key_offsets_(matrix_partition_key_offsets), + matrix_partition_major_firsts_(matrix_partition_major_firsts) + { + set_local_adj_matrix_partition_idx(size_t{0}); + } + + void set_local_adj_matrix_partition_idx(size_t adj_matrix_partition_idx) + { if (key_first_) { - *key_offset_ += offset; + matrix_partition_key_first_ = + *key_first_ + (*matrix_partition_key_offsets_)[adj_matrix_partition_idx]; + matrix_partition_key_last_ = + *key_first_ + (*matrix_partition_key_offsets_)[adj_matrix_partition_idx + 1]; + matrix_partition_major_first_ = (*matrix_partition_major_firsts_)[adj_matrix_partition_idx]; + matrix_partition_value_first_ = + value_first_ + (*matrix_partition_key_offsets_)[adj_matrix_partition_idx]; + } else { + if (matrix_partition_major_value_start_offsets_) { + matrix_partition_value_first_ = + value_first_ + (*matrix_partition_major_value_start_offsets_)[adj_matrix_partition_idx]; + } else { + assert(adj_matrix_partition_idx == 0); + matrix_partition_value_first_ = value_first_; + } } - else { - value_first_ += offset; + } + + ValueIterator value_data() const { return value_first_; } + + __device__ ValueIterator get_iter(vertex_t offset) const + { + auto value_offset = offset; + if (matrix_partition_key_first_) { + auto it = thrust::lower_bound(thrust::seq, + *matrix_partition_key_first_, + *matrix_partition_key_last_, + *matrix_partition_major_first_ + offset); + assert((it != *matrix_partition_key_last_) && + (*it == (*matrix_partition_major_first_ + offset))); + 
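+      // keys are the sorted unique majors with local edges and values are stored
+      // only for those keys, so convert the major offset to a value offset using
+      // the position of the matching key (found by the binary search above)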
value_offset = static_cast(thrust::distance(*matrix_partition_key_first_, it)); } + return matrix_partition_value_first_ + value_offset; + } + + __device__ value_type get(vertex_t offset) const { return *get_iter(offset); } + + private: + thrust::optional key_first_{thrust::nullopt}; + ValueIterator value_first_{}; + + thrust::optional matrix_partition_key_offsets_{thrust::nullopt}; // host data + thrust::optional matrix_partition_major_firsts_{thrust::nullopt}; // host data + + thrust::optional matrix_partition_major_value_start_offsets_{ + thrust::nullopt}; // host data + + thrust::optional matrix_partition_key_first_{thrust::nullopt}; + thrust::optional matrix_partition_key_last_{thrust::nullopt}; + thrust::optional matrix_partition_major_first_{thrust::nullopt}; + + ValueIterator matrix_partition_value_first_{}; +}; + +template +class minor_properties_device_view_t { + public: + using value_type = typename thrust::iterator_traits::value_type; + + minor_properties_device_view_t() = default; + + minor_properties_device_view_t(ValueIterator value_first) + : value_first_(value_first) + { + } + + minor_properties_device_view_t(vertex_t const* key_first, + vertex_t const* key_last, + vertex_t minor_first, + ValueIterator value_first) + : key_first_(key_first), + key_last_(key_last), + minor_first_(minor_first), + value_first_(value_first) + { } ValueIterator value_data() const { return value_first_; } @@ -69,8 +154,8 @@ class row_col_properties_device_view_t { { auto value_offset = offset; if (key_first_) { - auto it = thrust::lower_bound(thrust::seq, *key_first_, *key_last_, offset + *key_offset_); - assert((it != *key_last_) && (*it == (offset + *key_offset_))); + auto it = thrust::lower_bound(thrust::seq, *key_first_, *key_last_, *minor_first_ + offset); + assert((it != *key_last_) && (*it == (*minor_first_ + offset))); value_offset = static_cast(thrust::distance(*key_first_, it)); } return value_first_ + value_offset; @@ -81,33 +166,105 @@ class row_col_properties_device_view_t { private: thrust::optional key_first_{thrust::nullopt}; thrust::optional key_last_{thrust::nullopt}; - thrust::optional key_offset_{0}; + thrust::optional minor_first_{thrust::nullopt}; ValueIterator value_first_{}; }; template -class row_col_properties_t { +class major_properties_t { public: - row_col_properties_t() - : key_first_(std::nullopt), - key_last_(std::nullopt), - buffer_(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) + major_properties_t() : buffer_(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) {} + + major_properties_t(raft::handle_t const& handle, + vertex_t buffer_size, + std::vector&& matrix_partition_major_value_start_offsets) + : buffer_(allocate_dataframe_buffer(buffer_size, handle.get_stream())), + matrix_partition_major_value_start_offsets_( + std::move(matrix_partition_major_value_start_offsets)) + { + } + + major_properties_t(raft::handle_t const& handle, + vertex_t const* key_first, + std::vector&& matrix_partition_key_offsets, + std::vector&& matrix_partition_major_firsts) + : key_first_(key_first), + buffer_( + allocate_dataframe_buffer(matrix_partition_key_offsets.back(), handle.get_stream())), + matrix_partition_key_offsets_(std::move(matrix_partition_key_offsets)), + matrix_partition_major_firsts_(std::move(matrix_partition_major_firsts)) + { + } + + void fill(T value, rmm::cuda_stream_view stream) + { + thrust::fill( + rmm::exec_policy(stream), value_data(), value_data() + size_dataframe_buffer(buffer_), value); + } + + auto key_first() { return key_first_; } + auto 
key_last() { return key_first_ + matrix_partition_key_offsets_.back(); } + auto value_data() { return get_dataframe_buffer_begin(buffer_); } + + auto device_view() const + { + auto value_first = get_dataframe_buffer_cbegin(buffer_); + if (key_first_) { + return major_properties_device_view_t( + *key_first_, + value_first, + (*matrix_partition_key_offsets_).data(), + (*matrix_partition_major_firsts_).data()); + } else { + return major_properties_device_view_t( + value_first, (*matrix_partition_major_value_start_offsets_).data()); + } + } + + auto mutable_device_view() { + auto value_first = get_dataframe_buffer_begin(buffer_); + if (key_first_) { + return major_properties_device_view_t( + *key_first_, + value_first, + (*matrix_partition_key_offsets_).data(), + (*matrix_partition_major_firsts_).data()); + } else { + return major_properties_device_view_t( + value_first, (*matrix_partition_major_value_start_offsets_).data()); + } } - row_col_properties_t(raft::handle_t const& handle, vertex_t buffer_size) - : key_first_(std::nullopt), - key_last_(std::nullopt), - buffer_(allocate_dataframe_buffer(buffer_size, handle.get_stream())) + private: + std::optional key_first_{std::nullopt}; + + decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) buffer_; + + std::optional> matrix_partition_key_offsets_{std::nullopt}; + std::optional> matrix_partition_major_firsts_{std::nullopt}; + + std::optional> matrix_partition_major_value_start_offsets_{std::nullopt}; +}; + +template +class minor_properties_t { + public: + minor_properties_t() : buffer_(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) {} + + minor_properties_t(raft::handle_t const& handle, vertex_t buffer_size) + : buffer_(allocate_dataframe_buffer(buffer_size, handle.get_stream())) { } - row_col_properties_t(raft::handle_t const& handle, - vertex_t const* key_first, - vertex_t const* key_last) + minor_properties_t(raft::handle_t const& handle, + vertex_t const* key_first, + vertex_t const* key_last, + vertex_t minor_first) : key_first_(key_first), key_last_(key_last), + minor_first_(minor_first), buffer_( allocate_dataframe_buffer(thrust::distance(key_first, key_last), handle.get_stream())) { @@ -127,10 +284,10 @@ class row_col_properties_t { { auto value_first = get_dataframe_buffer_cbegin(buffer_); if (key_first_) { - return row_col_properties_device_view_t( - *key_first_, *key_last_, value_first); + return minor_properties_device_view_t( + *key_first_, *key_last_, *minor_first_, value_first); } else { - return row_col_properties_device_view_t(value_first); + return minor_properties_device_view_t(value_first); } } @@ -138,16 +295,17 @@ class row_col_properties_t { { auto value_first = get_dataframe_buffer_begin(buffer_); if (key_first_) { - return row_col_properties_device_view_t( - *key_first_, *key_last_, value_first); + return minor_properties_device_view_t( + *key_first_, *key_last_, *minor_first_, value_first); } else { - return row_col_properties_device_view_t(value_first); + return minor_properties_device_view_t(value_first); } } private: std::optional key_first_{std::nullopt}; std::optional key_last_{std::nullopt}; + std::optional minor_first_{std::nullopt}; decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) buffer_; }; @@ -181,14 +339,47 @@ class row_properties_t { row_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) { + using vertex_t = typename GraphViewType::vertex_type; + auto key_first = graph_view.get_local_sorted_unique_edge_row_begin(); - auto key_last = 
graph_view.get_local_sorted_unique_edge_row_end(); if (key_first) { - properties_ = detail::row_col_properties_t( - handle, *key_first, *key_last); + if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (GraphViewType::is_adj_matrix_transposed) { + auto key_last = graph_view.get_local_sorted_unique_edge_row_end(); + properties_ = detail::minor_properties_t( + handle, *key_first, *key_last, graph_view.get_local_adj_matrix_partition_row_first()); + } else { + std::vector matrix_partition_major_firsts( + graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_major_firsts[i] = + graph_view.get_local_adj_matrix_partition_row_first(i); + } + properties_ = detail::major_properties_t( + handle, + *key_first, + *(graph_view.get_local_sorted_unique_edge_row_offsets()), + std::move(matrix_partition_major_firsts)); + } + } else { + assert(false); + } } else { - properties_ = detail::row_col_properties_t( - handle, graph_view.get_number_of_local_adj_matrix_partition_rows()); + if constexpr (GraphViewType::is_adj_matrix_transposed) { + properties_ = detail::minor_properties_t( + handle, graph_view.get_number_of_local_adj_matrix_partition_rows()); + } else { + std::vector matrix_partition_major_value_start_offsets( + graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_major_value_start_offsets[i] = + graph_view.get_local_adj_matrix_partition_row_value_start_offset(i); + } + properties_ = detail::major_properties_t( + handle, + graph_view.get_number_of_local_adj_matrix_partition_rows(), + std::move(matrix_partition_major_value_start_offsets)); + } } } @@ -203,7 +394,10 @@ class row_properties_t { auto mutable_device_view() { return properties_.mutable_device_view(); } private: - detail::row_col_properties_t properties_{}; + std::conditional_t, + detail::major_properties_t> + properties_{}; }; template @@ -217,14 +411,47 @@ class col_properties_t { col_properties_t(raft::handle_t const& handle, GraphViewType const& graph_view) { + using vertex_t = typename GraphViewType::vertex_type; + auto key_first = graph_view.get_local_sorted_unique_edge_col_begin(); - auto key_last = graph_view.get_local_sorted_unique_edge_col_end(); if (key_first) { - properties_ = detail::row_col_properties_t( - handle, *key_first, *key_last); + if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (GraphViewType::is_adj_matrix_transposed) { + std::vector matrix_partition_major_firsts( + graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_major_firsts[i] = + graph_view.get_local_adj_matrix_partition_col_first(i); + } + properties_ = detail::major_properties_t( + handle, + *key_first, + *(graph_view.get_local_sorted_unique_edge_col_offsets()), + std::move(matrix_partition_major_firsts)); + } else { + auto key_last = graph_view.get_local_sorted_unique_edge_col_end(); + properties_ = detail::minor_properties_t( + handle, *key_first, *key_last, graph_view.get_local_adj_matrix_partition_col_first()); + } + } else { + assert(false); + } } else { - properties_ = detail::row_col_properties_t( - handle, graph_view.get_number_of_local_adj_matrix_partition_cols()); + if constexpr (GraphViewType::is_adj_matrix_transposed) { + std::vector matrix_partition_major_value_start_offsets( + 
graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_major_value_start_offsets[i] = + graph_view.get_local_adj_matrix_partition_col_value_start_offset(i); + } + properties_ = detail::major_properties_t( + handle, + graph_view.get_number_of_local_adj_matrix_partition_cols(), + std::move(matrix_partition_major_value_start_offsets)); + } else { + properties_ = detail::minor_properties_t( + handle, graph_view.get_number_of_local_adj_matrix_partition_cols()); + } } } @@ -239,7 +466,10 @@ class col_properties_t { auto mutable_device_view() { return properties_.mutable_device_view(); } private: - detail::row_col_properties_t properties_{}; + std::conditional_t, + detail::minor_properties_t> + properties_{}; }; template @@ -247,7 +477,7 @@ class dummy_properties_device_view_t { public: using value_type = thrust::nullopt_t; - void add_offset(vertex_t offset) {} // no-op + void set_local_adj_matrix_partition_idx(size_t adj_matrix_partition_idx) {} // no-op __device__ auto get(vertex_t offset) const { return thrust::nullopt; } }; @@ -261,11 +491,11 @@ class dummy_properties_t { }; template -auto device_view_concat(detail::row_col_properties_device_view_t... device_views) +auto device_view_concat(detail::major_properties_device_view_t... device_views) { auto concat_first = thrust::make_zip_iterator( thrust_tuple_cat(detail::to_thrust_tuple(device_views.value_data())...)); - return detail::row_col_properties_device_view_t(concat_first); + return detail::major_properties_device_view_t(concat_first); } } // namespace cugraph diff --git a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh index 79807b3728d..1ee2dd5b2d8 100644 --- a/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_by_adj_matrix_row_col_key_e.cuh @@ -392,17 +392,14 @@ transform_reduce_by_adj_matrix_row_col_key_e( auto matrix_partition_row_value_input = adj_matrix_row_value_input; auto matrix_partition_col_value_input = adj_matrix_col_value_input; if constexpr (GraphViewType::is_adj_matrix_transposed) { - matrix_partition_col_value_input.add_offset( - matrix_partition.get_major_value_start_offset()); + matrix_partition_col_value_input.set_local_adj_matrix_partition_idx(i); } else { - matrix_partition_row_value_input.add_offset( - matrix_partition.get_major_value_start_offset()); + matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); } auto matrix_partition_row_col_key_input = adj_matrix_row_col_key_input; if constexpr ((adj_matrix_row_key && !GraphViewType::is_adj_matrix_transposed) || (!adj_matrix_row_key && GraphViewType::is_adj_matrix_transposed)) { - matrix_partition_row_col_key_input.add_offset( - matrix_partition.get_major_value_start_offset()); + matrix_partition_row_col_key_input.set_local_adj_matrix_partition_idx(i); } auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); diff --git a/cpp/include/cugraph/prims/transform_reduce_e.cuh b/cpp/include/cugraph/prims/transform_reduce_e.cuh index 990730dee32..275fa11a95e 100644 --- a/cpp/include/cugraph/prims/transform_reduce_e.cuh +++ b/cpp/include/cugraph/prims/transform_reduce_e.cuh @@ -424,9 +424,9 @@ T transform_reduce_e(raft::handle_t const& handle, auto matrix_partition_row_value_input = adj_matrix_row_value_input; auto matrix_partition_col_value_input 
= adj_matrix_col_value_input; if constexpr (GraphViewType::is_adj_matrix_transposed) { - matrix_partition_col_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + matrix_partition_col_value_input.set_local_adj_matrix_partition_idx(i); } else { - matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); } auto segment_offsets = graph_view.get_local_adj_matrix_partition_segment_offsets(i); diff --git a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh index 422bfbd82fa..c84fcd19ce4 100644 --- a/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh +++ b/cpp/include/cugraph/prims/update_frontier_v_push_if_out_nbr.cuh @@ -980,7 +980,7 @@ void update_frontier_v_push_if_out_nbr( auto matrix_partition_row_value_input = adj_matrix_row_value_input; auto matrix_partition_col_value_input = adj_matrix_col_value_input; - matrix_partition_row_value_input.add_offset(matrix_partition.get_major_value_start_offset()); + matrix_partition_row_value_input.set_local_adj_matrix_partition_idx(i); if (segment_offsets) { static_assert(detail::num_sparse_segments_per_vertex_partition == 3); diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 9b774bc3f4d..23adca97342 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -260,11 +260,11 @@ class Louvain { current_graph_view_, graph_view_t::is_multi_gpu ? src_clusters_cache_.device_view() - : detail::row_col_properties_device_view_t( + : detail::major_properties_device_view_t( next_clusters_v_.begin()), graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() - : detail::row_col_properties_device_view_t( + : detail::minor_properties_device_view_t( next_clusters_v_.begin()), [] __device__(auto, auto, weight_t wt, auto src_cluster, auto nbr_cluster) { if (src_cluster == nbr_cluster) { @@ -396,11 +396,11 @@ class Louvain { current_graph_view_, graph_view_t::is_multi_gpu ? src_clusters_cache_.device_view() - : detail::row_col_properties_device_view_t( + : detail::major_properties_device_view_t( next_clusters_v_.data()), graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() - : detail::row_col_properties_device_view_t( + : detail::minor_properties_device_view_t( next_clusters_v_.data()), [] __device__(auto src, auto dst, auto wt, auto src_cluster, auto nbr_cluster) { weight_t sum{0}; @@ -491,13 +491,13 @@ class Louvain { src_cluster_weights.device_view(), src_old_cluster_sum_subtract_pairs.device_view()) : device_view_concat( - detail::row_col_properties_device_view_t( + detail::major_properties_device_view_t( vertex_weights_v_.data()), - detail::row_col_properties_device_view_t( + detail::major_properties_device_view_t( next_clusters_v_.data()), - detail::row_col_properties_device_view_t( + detail::major_properties_device_view_t( vertex_cluster_weights_v.data()), - detail::row_col_properties_device_view_t( cluster_old_sum_subtract_pair_first)); @@ -507,7 +507,7 @@ class Louvain { zipped_src_device_view, graph_view_t::is_multi_gpu ? dst_clusters_cache_.device_view() - : detail::row_col_properties_device_view_t( + : detail::minor_properties_device_view_t( next_clusters_v_.data()), cluster_keys_v_.begin(), cluster_keys_v_.end(), @@ -539,7 +539,7 @@ class Louvain { dummy_properties_t{}.device_view(), graph_view_t::is_multi_gpu ? 
src_clusters_cache_.device_view() - : detail::row_col_properties_device_view_t( + : detail::major_properties_device_view_t( next_clusters_v_.data()), detail::return_edge_weight_t{}, weight_t{0}); diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 1f8408b4f4a..0b672151708 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -544,7 +544,7 @@ void weakly_connected_components_impl(raft::handle_t const& handle, [col_components = GraphViewType::is_multi_gpu ? adj_matrix_col_components.mutable_device_view() - : detail::row_col_properties_device_view_t(level_components), + : detail::minor_properties_device_view_t(level_components), col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), edge_buffer_first = get_dataframe_buffer_begin(edge_buffer), diff --git a/cpp/src/structure/coarsen_graph_impl.cuh b/cpp/src/structure/coarsen_graph_impl.cuh index 3d9e99a6fea..5e66b1c7667 100644 --- a/cpp/src/structure/coarsen_graph_impl.cuh +++ b/cpp/src/structure/coarsen_graph_impl.cuh @@ -494,7 +494,7 @@ coarsen_graph( matrix_partition_device_view_t( graph_view.get_matrix_partition_view()), labels, - detail::row_col_properties_device_view_t(labels), + detail::minor_properties_device_view_t(labels), graph_view.get_local_adj_matrix_partition_segment_offsets(0)); rmm::device_uvector unique_labels(graph_view.get_number_of_vertices(), diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh index 3a95fdb8fbc..ba91d485d65 100644 --- a/cpp/src/traversal/sssp_impl.cuh +++ b/cpp/src/traversal/sssp_impl.cuh @@ -167,7 +167,7 @@ void sssp(raft::handle_t const& handle, std::vector{static_cast(Bucket::next_near), static_cast(Bucket::far)}, GraphViewType::is_multi_gpu ? 
adj_matrix_row_distances.device_view() - : detail::row_col_properties_device_view_t(distances), + : detail::major_properties_device_view_t(distances), dummy_properties_t{}.device_view(), [vertex_partition, distances, cutoff] __device__( vertex_t src, vertex_t dst, weight_t w, auto src_val, auto) { From d5ee1462f571c1bcc0c101824821a5e19385367a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 16 Sep 2021 11:13:07 -0400 Subject: [PATCH 49/57] temp debug printouts --- cpp/src/community/louvain.cuh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index 23adca97342..dfa0c9725e8 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -175,6 +175,7 @@ class Louvain { compute_vertex_and_cluster_weights(); weight_t new_Q = update_clustering(total_edge_weight, resolution); +std::cout << graph_view_t::is_multi_gpu << " new_Q=" << new_Q << " dendrogram_->num_levels()=" << dendrogram_->num_levels() << " max_level=" << max_level << " total_edge_weight=" << total_edge_weight << " resolution=" << resolution << std::endl; if (new_Q <= best_modularity) { break; } @@ -355,6 +356,7 @@ class Louvain { } weight_t new_Q = modularity(total_edge_weight, resolution); +std::cout << graph_view_t::is_multi_gpu << "update_clustering new_Q=" << new_Q << std::endl; weight_t cur_Q = new_Q - 1; // To avoid the potential of having two vertices swap clusters @@ -370,6 +372,7 @@ class Louvain { up_down = !up_down; new_Q = modularity(total_edge_weight, resolution); +std::cout << graph_view_t::is_multi_gpu << "update_clustering loop new_Q=" << new_Q << std::endl; if (new_Q > cur_Q) { raft::copy(dendrogram_->current_level_begin(), @@ -523,6 +526,7 @@ class Louvain { cugraph::get_dataframe_buffer_begin(output_buffer), next_clusters_v_.begin(), detail::cluster_update_op_t{up_down}); +raft::print_device_vector("new next_clusters_v_", next_clusters_v_.data(), next_clusters_v_.size(), std::cout); if constexpr (graph_view_t::is_multi_gpu) { copy_to_adj_matrix_row( From e9b850a245e04296e24e6716165e8df84ea738d7 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 17 Sep 2021 06:42:02 -0400 Subject: [PATCH 50/57] undo debug printouts --- cpp/src/community/louvain.cuh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh index dfa0c9725e8..23adca97342 100644 --- a/cpp/src/community/louvain.cuh +++ b/cpp/src/community/louvain.cuh @@ -175,7 +175,6 @@ class Louvain { compute_vertex_and_cluster_weights(); weight_t new_Q = update_clustering(total_edge_weight, resolution); -std::cout << graph_view_t::is_multi_gpu << " new_Q=" << new_Q << " dendrogram_->num_levels()=" << dendrogram_->num_levels() << " max_level=" << max_level << " total_edge_weight=" << total_edge_weight << " resolution=" << resolution << std::endl; if (new_Q <= best_modularity) { break; } @@ -356,7 +355,6 @@ std::cout << graph_view_t::is_multi_gpu << " new_Q=" << new_Q << " dendrogram_-> } weight_t new_Q = modularity(total_edge_weight, resolution); -std::cout << graph_view_t::is_multi_gpu << "update_clustering new_Q=" << new_Q << std::endl; weight_t cur_Q = new_Q - 1; // To avoid the potential of having two vertices swap clusters @@ -372,7 +370,6 @@ std::cout << graph_view_t::is_multi_gpu << "update_clustering new_Q=" << new_Q < up_down = !up_down; new_Q = modularity(total_edge_weight, resolution); -std::cout << graph_view_t::is_multi_gpu << "update_clustering loop new_Q=" << new_Q << std::endl; if (new_Q > cur_Q) { 
raft::copy(dendrogram_->current_level_begin(), @@ -526,7 +523,6 @@ std::cout << graph_view_t::is_multi_gpu << "update_clustering loop new_Q=" << ne cugraph::get_dataframe_buffer_begin(output_buffer), next_clusters_v_.begin(), detail::cluster_update_op_t{up_down}); -raft::print_device_vector("new next_clusters_v_", next_clusters_v_.data(), next_clusters_v_.size(), std::cout); if constexpr (graph_view_t::is_multi_gpu) { copy_to_adj_matrix_row( From ec02b9d8ecabbd1c79e07da5c1c21e1d8f9c0809 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 17 Sep 2021 13:45:36 -0400 Subject: [PATCH 51/57] bug fix --- .../cugraph/prims/row_col_properties.cuh | 113 ++++++++++++++---- 1 file changed, 87 insertions(+), 26 deletions(-) diff --git a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh index ca6f16954ee..73358f64dd2 100644 --- a/cpp/include/cugraph/prims/row_col_properties.cuh +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -46,6 +46,7 @@ class major_properties_device_view_t { ValueIterator value_first) // for single-GPU only and for advanced users : value_first_(value_first) { + set_local_adj_matrix_partition_idx(size_t{0}); } major_properties_device_view_t(ValueIterator value_first, @@ -89,8 +90,34 @@ class major_properties_device_view_t { } } + std::optional key_data() const + { + return key_first_ ? std::optional{*key_first_} : std::nullopt; + } + ValueIterator value_data() const { return value_first_; } + std::optional matrix_partition_key_offsets() const + { + return matrix_partition_key_offsets_ + ? std::optional{*matrix_partition_key_offsets_} + : std::nullopt; + } + + std::optional matrix_partition_major_firsts() const + { + return matrix_partition_major_firsts_ + ? std::optional{*matrix_partition_major_firsts_} + : std::nullopt; + } + + std::optional matrix_partition_major_value_start_offsets() const + { + return matrix_partition_major_value_start_offsets_ + ? 
std::optional{*matrix_partition_major_value_start_offsets_} + : std::nullopt; + } + __device__ ValueIterator get_iter(vertex_t offset) const { auto value_offset = offset; @@ -132,10 +159,7 @@ class minor_properties_device_view_t { minor_properties_device_view_t() = default; - minor_properties_device_view_t(ValueIterator value_first) - : value_first_(value_first) - { - } + minor_properties_device_view_t(ValueIterator value_first) : value_first_(value_first) {} minor_properties_device_view_t(vertex_t const* key_first, vertex_t const* key_last, @@ -176,6 +200,11 @@ class major_properties_t { public: major_properties_t() : buffer_(allocate_dataframe_buffer(0, rmm::cuda_stream_view{})) {} + major_properties_t(raft::handle_t const& handle, vertex_t buffer_size) + : buffer_(allocate_dataframe_buffer(buffer_size, handle.get_stream())) + { + } + major_properties_t(raft::handle_t const& handle, vertex_t buffer_size, std::vector&& matrix_partition_major_value_start_offsets) @@ -216,9 +245,11 @@ class major_properties_t { value_first, (*matrix_partition_key_offsets_).data(), (*matrix_partition_major_firsts_).data()); - } else { + } else if (matrix_partition_major_value_start_offsets_) { return major_properties_device_view_t( value_first, (*matrix_partition_major_value_start_offsets_).data()); + } else { + return major_properties_device_view_t(value_first); } } @@ -231,9 +262,11 @@ class major_properties_t { value_first, (*matrix_partition_key_offsets_).data(), (*matrix_partition_major_firsts_).data()); - } else { + } else if (matrix_partition_major_value_start_offsets_) { return major_properties_device_view_t( value_first, (*matrix_partition_major_value_start_offsets_).data()); + } else { + return major_properties_device_view_t(value_first); } } @@ -326,6 +359,12 @@ auto to_thrust_tuple(Iterator iter) return iter.get_iterator_tuple(); } +template +decltype(auto) get_first_of_pack(T&& t, Ts&&...) 
+{ + return std::forward(t); +} + } // namespace detail template @@ -369,16 +408,21 @@ class row_properties_t { properties_ = detail::minor_properties_t( handle, graph_view.get_number_of_local_adj_matrix_partition_rows()); } else { - std::vector matrix_partition_major_value_start_offsets( - graph_view.get_number_of_local_adj_matrix_partitions()); - for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - matrix_partition_major_value_start_offsets[i] = - graph_view.get_local_adj_matrix_partition_row_value_start_offset(i); + if constexpr (GraphViewType::is_multi_gpu) { + std::vector matrix_partition_major_value_start_offsets( + graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_major_value_start_offsets[i] = + graph_view.get_local_adj_matrix_partition_row_value_start_offset(i); + } + properties_ = detail::major_properties_t( + handle, + graph_view.get_number_of_local_adj_matrix_partition_rows(), + std::move(matrix_partition_major_value_start_offsets)); + } else { + properties_ = detail::major_properties_t( + handle, graph_view.get_number_of_local_adj_matrix_partition_rows()); } - properties_ = detail::major_properties_t( - handle, - graph_view.get_number_of_local_adj_matrix_partition_rows(), - std::move(matrix_partition_major_value_start_offsets)); } } } @@ -438,16 +482,21 @@ class col_properties_t { } } else { if constexpr (GraphViewType::is_adj_matrix_transposed) { - std::vector matrix_partition_major_value_start_offsets( - graph_view.get_number_of_local_adj_matrix_partitions()); - for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { - matrix_partition_major_value_start_offsets[i] = - graph_view.get_local_adj_matrix_partition_col_value_start_offset(i); + if constexpr (GraphViewType::is_multi_gpu) { + std::vector matrix_partition_major_value_start_offsets( + graph_view.get_number_of_local_adj_matrix_partitions()); + for (size_t i = 0; i < graph_view.get_number_of_local_adj_matrix_partitions(); ++i) { + matrix_partition_major_value_start_offsets[i] = + graph_view.get_local_adj_matrix_partition_col_value_start_offset(i); + } + properties_ = detail::major_properties_t( + handle, + graph_view.get_number_of_local_adj_matrix_partition_cols(), + std::move(matrix_partition_major_value_start_offsets)); + } else { + properties_ = detail::major_properties_t( + handle, graph_view.get_number_of_local_adj_matrix_partition_cols()); } - properties_ = detail::major_properties_t( - handle, - graph_view.get_number_of_local_adj_matrix_partition_cols(), - std::move(matrix_partition_major_value_start_offsets)); } else { properties_ = detail::minor_properties_t( handle, graph_view.get_number_of_local_adj_matrix_partition_cols()); @@ -491,11 +540,23 @@ class dummy_properties_t { }; template -auto device_view_concat(detail::major_properties_device_view_t... device_views) +auto device_view_concat(detail::major_properties_device_view_t const&... 
device_views) { auto concat_first = thrust::make_zip_iterator( thrust_tuple_cat(detail::to_thrust_tuple(device_views.value_data())...)); - return detail::major_properties_device_view_t(concat_first); + auto first = detail::get_first_of_pack(device_views...); + if (first.key_data()) { + return detail::major_properties_device_view_t( + *(first.key_data()), + concat_first, + *(first.matrix_partition_key_offsets()), + *(first.matrix_partition_major_firsts())); + } else if (first.matrix_partition_major_value_start_offsets()) { + return detail::major_properties_device_view_t( + concat_first, *(first.matrix_partition_major_value_start_offsets())); + } else { + return detail::major_properties_device_view_t(concat_first); + } } } // namespace cugraph From 82b6f75d790ee4286d4ee743fc06db71cae25d0e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 17 Sep 2021 15:35:58 -0400 Subject: [PATCH 52/57] bug fix --- cpp/src/structure/renumber_edgelist_impl.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 301edae3173..655a1b5fba5 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -754,7 +754,7 @@ renumber_edgelist( edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part vertex_t max_segment_size{0}; - for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + for (size_t i = 0; i < row_comm_size; ++i) { max_segment_size = std::max( max_segment_size, partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i)); } From 2007fe1e6eee1eb0d81ac65fff7ece07e47fcb77 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 17 Sep 2021 15:37:08 -0400 Subject: [PATCH 53/57] fix compiler warning --- cpp/src/structure/renumber_edgelist_impl.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 655a1b5fba5..6c13a44652d 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -754,7 +754,7 @@ renumber_edgelist( edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part vertex_t max_segment_size{0}; - for (size_t i = 0; i < row_comm_size; ++i) { + for (int i = 0; i < row_comm_size; ++i) { max_segment_size = std::max( max_segment_size, partition.get_vertex_partition_size(col_comm_rank * row_comm_size + i)); } From e264dc2032b08a5dd6fef7aaef6c2e1a43a4a679 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 18 Sep 2021 12:51:09 -0400 Subject: [PATCH 54/57] bug fix --- .../prims/copy_to_adj_matrix_row_col.cuh | 4 ++-- .../cugraph/prims/row_col_properties.cuh | 7 ++++++- cpp/src/structure/graph_impl.cuh | 18 ++++++++++++++++-- cpp/src/structure/graph_view_impl.cuh | 2 +- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh index b9c54c758c1..7100e7c8663 100644 --- a/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh +++ b/cpp/include/cugraph/prims/copy_to_adj_matrix_row_col.cuh @@ -240,7 +240,7 @@ void copy_to_matrix_major(raft::handle_t const& handle, matrix_major_value_output.value_data() + (*key_offsets)[i]] __device__(auto i) { auto major = *(output_key_first + i); auto it = 
thrust::lower_bound(thrust::seq, rx_vertex_first, rx_vertex_last, major); - if (*it == major) { + if ((it != rx_vertex_last) && (*it == major)) { auto rx_value = *(rx_value_first + thrust::distance(rx_vertex_first, it)); *(output_value_first + i) = rx_value; } @@ -479,7 +479,7 @@ void copy_to_matrix_minor(raft::handle_t const& handle, matrix_minor_value_output.value_data() + (*key_offsets)[i]] __device__(auto i) { auto minor = *(output_key_first + i); auto it = thrust::lower_bound(thrust::seq, rx_vertex_first, rx_vertex_last, minor); - if (*it == minor) { + if ((it != rx_vertex_last) && (*it == minor)) { auto rx_value = *(rx_value_first + thrust::distance(rx_vertex_first, it)); *(output_value_first + i) = rx_value; } diff --git a/cpp/include/cugraph/prims/row_col_properties.cuh b/cpp/include/cugraph/prims/row_col_properties.cuh index 73358f64dd2..ec219f5290f 100644 --- a/cpp/include/cugraph/prims/row_col_properties.cuh +++ b/cpp/include/cugraph/prims/row_col_properties.cuh @@ -233,7 +233,12 @@ class major_properties_t { } auto key_first() { return key_first_; } - auto key_last() { return key_first_ + matrix_partition_key_offsets_.back(); } + auto key_last() + { + return key_first_ ? std::make_optional(*key_first_ + + (*matrix_partition_key_offsets_).back()) + : std::nullopt; + } auto value_data() { return get_dataframe_buffer_begin(buffer_); } auto device_view() const diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 206594c8348..4e393e43504 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -50,7 +50,7 @@ struct out_of_range_t { vertex_t minor_first{}; vertex_t minor_last{}; - __device__ bool operator()(thrust::tuple t) + __device__ bool operator()(thrust::tuple t) const { auto major = thrust::get<0>(t); auto minor = thrust::get<1>(t); @@ -428,14 +428,28 @@ graph_t{major_first + + (*adj_matrix_partition_segment_offsets_) + [(*(meta.segment_offsets)).size() * i + + detail::num_sparse_segments_per_vertex_partition]} + : std::nullopt; cur_size += thrust::distance( local_sorted_unique_edge_majors.data() + cur_size, thrust::copy_if( handle.get_thrust_policy(), thrust::make_counting_iterator(major_first), - thrust::make_counting_iterator(major_last), + thrust::make_counting_iterator(use_dcs ? 
*major_hypersparse_first : major_last), local_sorted_unique_edge_majors.data() + cur_size, has_nzd_t{adj_matrix_partition_offsets_[i].data(), major_first})); + if (use_dcs) { + thrust::copy(handle.get_thrust_policy(), + (*adj_matrix_partition_dcs_nzd_vertices_)[i].begin(), + (*adj_matrix_partition_dcs_nzd_vertices_)[i].begin() + + (*adj_matrix_partition_dcs_nzd_vertex_counts_)[i], + local_sorted_unique_edge_majors.data() + cur_size); + cur_size += (*adj_matrix_partition_dcs_nzd_vertex_counts_)[i]; + } } assert(cur_size == num_local_unique_edge_majors); diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 87c7351f07a..41d43ab27a8 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -49,7 +49,7 @@ struct out_of_range_t { vertex_t min{}; vertex_t max{}; - __device__ bool operator()(vertex_t v) { return (v < min) || (v >= max); } + __device__ bool operator()(vertex_t v) const { return (v < min) || (v >= max); } }; template From 28e0741b89e3fb274fa177eaf7fd9c9f025f3fc1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 19 Sep 2021 14:41:56 -0400 Subject: [PATCH 55/57] clang-format --- cpp/src/components/weakly_connected_components_impl.cuh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 0b672151708..66c9447605d 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -545,9 +545,8 @@ void weakly_connected_components_impl(raft::handle_t const& handle, GraphViewType::is_multi_gpu ? adj_matrix_col_components.mutable_device_view() : detail::minor_properties_device_view_t(level_components), - col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), - edge_buffer_first = - get_dataframe_buffer_begin(edge_buffer), + col_first = level_graph_view.get_local_adj_matrix_partition_col_first(), + edge_buffer_first = get_dataframe_buffer_begin(edge_buffer), num_edge_inserts = num_edge_inserts.data()] __device__(auto tagged_src, vertex_t dst, auto, auto) { auto tag = thrust::get<1>(tagged_src); From f7af95b7bcde7b83649da9c8302520f3a1ff3ddc Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 19 Sep 2021 15:55:42 -0400 Subject: [PATCH 56/57] adjust variable scope to free memory buffer when unnecessary --- cpp/src/structure/renumber_edgelist_impl.cuh | 73 +++++++++++--------- 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 6c13a44652d..ec6c0696694 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -702,42 +702,47 @@ renumber_edgelist( comm.barrier(); // currently, this is ncclAllReduce #endif - vertex_t max_matrix_partition_major_size{0}; - for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { - max_matrix_partition_major_size = - std::max(max_matrix_partition_major_size, partition.get_matrix_partition_major_size(i)); - } - rmm::device_uvector renumber_map_major_labels(max_matrix_partition_major_size, - handle.get_stream()); - for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { - device_bcast(col_comm, - renumber_map_labels.data(), - renumber_map_major_labels.data(), - partition.get_matrix_partition_major_size(i), - i, - handle.get_stream()); + { + vertex_t max_matrix_partition_major_size{0}; +
for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + max_matrix_partition_major_size = + std::max(max_matrix_partition_major_size, partition.get_matrix_partition_major_size(i)); + } + rmm::device_uvector renumber_map_major_labels(max_matrix_partition_major_size, + handle.get_stream()); + for (size_t i = 0; i < edgelist_major_vertices.size(); ++i) { + device_bcast(col_comm, + renumber_map_labels.data(), + renumber_map_major_labels.data(), + partition.get_matrix_partition_major_size(i), + i, + handle.get_stream()); - CUDA_TRY(cudaStreamSynchronize( - handle.get_stream())); // cuco::static_map currently does not take stream + CUDA_TRY(cudaStreamSynchronize( + handle.get_stream())); // cuco::static_map currently does not take stream - auto poly_alloc = rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); - auto stream_adapter = rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr}); - cuco::static_map - renumber_map{ - // cuco::static_map requires at least one empty slot - std::max(static_cast( - static_cast(partition.get_matrix_partition_major_size(i)) / load_factor), - static_cast(partition.get_matrix_partition_major_size(i)) + 1), - invalid_vertex_id::value, - invalid_vertex_id::value, - stream_adapter}; - auto pair_first = thrust::make_zip_iterator(thrust::make_tuple( - renumber_map_major_labels.begin(), - thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))); - renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i)); - renumber_map.find(edgelist_major_vertices[i], - edgelist_major_vertices[i] + edgelist_edge_counts[i], - edgelist_major_vertices[i]); + auto poly_alloc = + rmm::mr::polymorphic_allocator(rmm::mr::get_current_device_resource()); + auto stream_adapter = + rmm::mr::make_stream_allocator_adaptor(poly_alloc, cudaStream_t{nullptr}); + cuco::static_map + renumber_map{ + // cuco::static_map requires at least one empty slot + std::max( + static_cast(static_cast(partition.get_matrix_partition_major_size(i)) / + load_factor), + static_cast(partition.get_matrix_partition_major_size(i)) + 1), + invalid_vertex_id::value, + invalid_vertex_id::value, + stream_adapter}; + auto pair_first = thrust::make_zip_iterator(thrust::make_tuple( + renumber_map_major_labels.begin(), + thrust::make_counting_iterator(partition.get_matrix_partition_major_first(i)))); + renumber_map.insert(pair_first, pair_first + partition.get_matrix_partition_major_size(i)); + renumber_map.find(edgelist_major_vertices[i], + edgelist_major_vertices[i] + edgelist_edge_counts[i], + edgelist_major_vertices[i]); + } } // barrier is necessary here to avoid potential overlap (which can leads to deadlock) between two From 78cfbda5427ac20b78295ab83860736a9f88f32d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 19 Sep 2021 17:36:17 -0400 Subject: [PATCH 57/57] disable (key, value) pairs --- cpp/include/cugraph/graph_view.hpp | 4 +-- cpp/include/cugraph/utilities/cython.hpp | 9 ------ cpp/src/utilities/cython.cu | 41 ++++++++++++------------ 3 files changed, 22 insertions(+), 32 deletions(-) diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index 2475cb71995..fa04852133b 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -223,10 +223,10 @@ namespace detail { using namespace cugraph::visitors; -// FIXME: threshold values require tuning +// FIXME: threshold values require tuning (currently disabled) // use (key, value) pairs to store row/column 
properties if (unique edge rows/cols) over (V / // row_comm_size|col_comm_size) is smaller than the threshold value -double constexpr row_col_properties_kv_pair_fill_ratio_threshold = 0.25; +double constexpr row_col_properties_kv_pair_fill_ratio_threshold = 0.0; // FIXME: threshold values require tuning // use the hypersparse format (currently, DCSR or DCSC) for the vertices with their degrees smaller diff --git a/cpp/include/cugraph/utilities/cython.hpp b/cpp/include/cugraph/utilities/cython.hpp index 37000bd57e7..3a4f437bfd0 100644 --- a/cpp/include/cugraph/utilities/cython.hpp +++ b/cpp/include/cugraph/utilities/cython.hpp @@ -102,8 +102,6 @@ struct graph_container_t { size_t num_local_edges; size_t num_global_vertices; size_t num_global_edges; - size_t num_local_unique_edge_rows{}; - size_t num_local_unique_edge_cols{}; numberTypeEnum vertexType; numberTypeEnum edgeType; numberTypeEnum weightType; @@ -272,9 +270,6 @@ struct renum_tuple_t { return std::make_unique>(segment_offsets_); } - vertex_t& get_num_unique_edge_majors(void) { return num_unique_edge_majors_; } - vertex_t& get_num_unique_edge_minors(void) { return num_unique_edge_minors_; } - // `partition_t` pass-through getters // int get_part_row_size() const { return part_.get_row_size(); } @@ -369,8 +364,6 @@ struct renum_tuple_t { vertex_t nv_{0}; edge_t ne_{0}; std::vector segment_offsets_; - vertex_t num_unique_edge_majors_{0}; - vertex_t num_unique_edge_minors_{0}; }; // FIXME: finish description for vertex_partition_offsets @@ -438,8 +431,6 @@ void populate_graph_container(graph_container_t& graph_container, size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, - size_t num_local_unique_edge_rows, - size_t num_local_unique_edge_cols, bool is_weighted, bool is_symmetric, bool transposed, diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu index 226f9891340..aba35ceea0b 100644 --- a/cpp/src/utilities/cython.cu +++ b/cpp/src/utilities/cython.cu @@ -169,7 +169,10 @@ std::unique_ptr> crea static_cast(graph_container.segment_offsets), static_cast(graph_container.segment_offsets) + graph_container.num_segments + 1) - : std::nullopt}, + : std::nullopt, + // FIXME: disable (key, value) pairs at this moment (should be enabled once fully tuned). 
+ std::numeric_limits::max(), + std::numeric_limits::max()}, graph_container.do_expensive_check); } @@ -223,8 +226,6 @@ void populate_graph_container(graph_container_t& graph_container, size_t num_local_edges, size_t num_global_vertices, size_t num_global_edges, - size_t num_local_unique_edge_rows, - size_t num_local_unique_edge_cols, bool is_weighted, bool is_symmetric, bool transposed, @@ -248,24 +249,22 @@ void populate_graph_container(graph_container_t& graph_container, graph_container.col_comm_rank = col_comm_rank; } - graph_container.src_vertices = src_vertices; - graph_container.dst_vertices = dst_vertices; - graph_container.weights = weights; - graph_container.is_weighted = is_weighted; - graph_container.vertex_partition_offsets = vertex_partition_offsets; - graph_container.segment_offsets = segment_offsets; - graph_container.num_segments = num_segments; - graph_container.num_local_edges = num_local_edges; - graph_container.num_global_vertices = num_global_vertices; - graph_container.num_global_edges = num_global_edges; - graph_container.num_local_unique_edge_rows = num_local_unique_edge_rows; - graph_container.num_local_unique_edge_cols = num_local_unique_edge_cols; - graph_container.vertexType = vertexType; - graph_container.edgeType = edgeType; - graph_container.weightType = weightType; - graph_container.transposed = transposed; - graph_container.is_multi_gpu = multi_gpu; - graph_container.do_expensive_check = do_expensive_check; + graph_container.src_vertices = src_vertices; + graph_container.dst_vertices = dst_vertices; + graph_container.weights = weights; + graph_container.is_weighted = is_weighted; + graph_container.vertex_partition_offsets = vertex_partition_offsets; + graph_container.segment_offsets = segment_offsets; + graph_container.num_segments = num_segments; + graph_container.num_local_edges = num_local_edges; + graph_container.num_global_vertices = num_global_vertices; + graph_container.num_global_edges = num_global_edges; + graph_container.vertexType = vertexType; + graph_container.edgeType = edgeType; + graph_container.weightType = weightType; + graph_container.transposed = transposed; + graph_container.is_multi_gpu = multi_gpu; + graph_container.do_expensive_check = do_expensive_check; graph_properties_t graph_props{.is_symmetric = is_symmetric, .is_multigraph = false}; graph_container.graph_props = graph_props;
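
Note on the PATCH 54 fix in copy_to_adj_matrix_row_col.cuh: thrust::lower_bound returns the end of the searched range when every element compares less than the key, so the pre-fix check `*it == major` dereferenced one past the end in that case. Below is a minimal host-side sketch of the guarded lookup pattern, with hypothetical buffer names (rx_vertices, rx_values) and std::lower_bound standing in for the device-side thrust::lower_bound, which has the same contract:

// Sketch only; names are illustrative, not cuGraph API.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <vector>

int main()
{
  std::vector<int32_t> rx_vertices{2, 5, 9};     // sorted vertex IDs received from peers
  std::vector<double> rx_values{0.2, 0.5, 0.9};  // property values aligned with rx_vertices

  for (int32_t v : {5, 7, 11}) {
    auto it = std::lower_bound(rx_vertices.begin(), rx_vertices.end(), v);
    // lower_bound returns end() when v is larger than every element (v == 11 here);
    // the pre-fix code dereferenced it unconditionally, reading past the end.
    if ((it != rx_vertices.end()) && (*it == v)) {
      std::cout << v << " -> " << rx_values[std::distance(rx_vertices.begin(), it)] << '\n';
    } else {
      std::cout << v << " -> not received\n";
    }
  }
  return 0;
}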
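
Note on PATCH 57: per the graph_view.hpp comment, (key, value) pairs are used for row/column properties when the number of unique edge rows/cols divided by V / row_comm_size (or col_comm_size) is smaller than row_col_properties_kv_pair_fill_ratio_threshold; pinning the threshold to 0.0 makes that comparison unsatisfiable, so the dense-array path is always taken. A minimal sketch of the gating decision, with made-up counts standing in for the graph-view internals:

// Sketch only; the counts below are illustrative assumptions.
#include <cstdint>
#include <iostream>

int main()
{
  double constexpr kv_pair_fill_ratio_threshold = 0.0;  // value after PATCH 57

  int64_t num_local_vertices     = 1000000;  // ~ V / row_comm_size (or col_comm_size)
  int64_t num_unique_edge_majors = 50000;    // local vertices that actually appear in edges

  auto fill_ratio =
    static_cast<double>(num_unique_edge_majors) / static_cast<double>(num_local_vertices);

  // Dense path: one value slot per local vertex, O(1) lookup.
  // (key, value) path: storage proportional to unique endpoints, binary-search lookup.
  if (fill_ratio < kv_pair_fill_ratio_threshold) {
    std::cout << "store row/col properties as sorted (key, value) pairs\n";
  } else {
    std::cout << "store row/col properties as a dense value array\n";
  }
  return 0;
}

The pair representation trades a binary search per access for memory proportional to the unique edge endpoints, which only pays off when the fill ratio is small; pinning the threshold to 0.0 rather than deleting the code path keeps the machinery in place for the tuning flagged in the FIXME comments.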