
Update graph partitioning scheme (#1443)
Partially addresses Issue #1442

Update the graph partitioning scheme to better control the trade-off between memory footprint and concurrency for large-scale graph processing on large clusters. The new partitioning scheme also simplifies the communication patterns among GPUs, which can potentially improve scalability.
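For illustration only (not part of this commit), a minimal sketch of the 2D layout the new scheme builds on: comm_size GPUs viewed as a col_comm_size x row_comm_size grid, with ranks laid out row-major as the index arithmetic in the diffs below suggests. The names here are hypothetical, not cuGraph API.

// Illustrative sketch only; names are hypothetical, not cuGraph API.
struct grid_position_t {
  int row;  // index along col_comm, range [0, col_comm_size)
  int col;  // index along row_comm, range [0, row_comm_size)
};

// Assumes rank == row * row_comm_size + col (row-major layout of the GPU grid).
inline grid_position_t to_grid_position(int comm_rank, int row_comm_size)
{
  return grid_position_t{comm_rank / row_comm_size, comm_rank % row_comm_size};
}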

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Alex Fender (https://github.com/afender)
  - Andrei Schaffer (https://github.com/aschaffer)

URL: #1443
seunghwak authored Apr 6, 2021
1 parent a6edf62 commit 9a1ab09
Showing 64 changed files with 5,481 additions and 2,841 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -432,6 +432,7 @@ add_library(cugraph SHARED
src/experimental/graph_view.cu
src/experimental/coarsen_graph.cu
src/experimental/renumber_edgelist.cu
src/experimental/renumber_utils.cu
src/experimental/relabel.cu
src/experimental/induced_subgraph.cu
src/experimental/bfs.cu
2 changes: 1 addition & 1 deletion cpp/include/dendrogram.hpp
@@ -27,7 +27,7 @@ class Dendrogram {
public:
void add_level(vertex_t first_index,
vertex_t num_verts,
cudaStream_t stream = 0,
cudaStream_t stream,
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
{
level_ptr_.push_back(std::make_unique<rmm::device_uvector<vertex_t>>(num_verts, stream, mr));
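A hypothetical caller sketch (the function, values, and includes are illustrative, not code from this PR) showing the effect of this change: the CUDA stream must now be passed explicitly because the "= 0" default was removed.

#include <cstdint>
#include <raft/handle.hpp>

// Hypothetical caller; works for any dendrogram type exposing add_level() as above.
template <typename dendrogram_t>
void add_first_level(dendrogram_t& dendrogram, raft::handle_t const& handle)
{
  int32_t const first_index = 0;
  int32_t const num_verts   = 1000;
  // The stream is now a required argument (no default); the
  // device_memory_resource argument still defaults to the current resource.
  dendrogram.add_level(first_index, num_verts, handle.get_stream());
}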
80 changes: 28 additions & 52 deletions cpp/include/experimental/detail/graph_utils.cuh
@@ -56,65 +56,32 @@ rmm::device_uvector<edge_t> compute_major_degrees(
rmm::device_uvector<edge_t> degrees(0, handle.get_stream());

vertex_t max_num_local_degrees{0};
for (int i = 0; i < (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size);
++i) {
auto vertex_partition_idx = partition.is_hypergraph_partitioned()
? static_cast<size_t>(i * row_comm_size + row_comm_rank)
: static_cast<size_t>(col_comm_rank * row_comm_size + i);
for (int i = 0; i < col_comm_size; ++i) {
auto vertex_partition_idx = static_cast<size_t>(i * row_comm_size + row_comm_rank);
auto vertex_partition_size = partition.get_vertex_partition_size(vertex_partition_idx);
max_num_local_degrees = std::max(max_num_local_degrees, vertex_partition_size);
if (i == (partition.is_hypergraph_partitioned() ? col_comm_rank : row_comm_rank)) {
degrees.resize(vertex_partition_size, handle.get_stream());
}
if (i == col_comm_rank) { degrees.resize(vertex_partition_size, handle.get_stream()); }
}
local_degrees.resize(max_num_local_degrees, handle.get_stream());
for (int i = 0; i < (partition.is_hypergraph_partitioned() ? col_comm_size : row_comm_size);
++i) {
auto vertex_partition_idx = partition.is_hypergraph_partitioned()
? static_cast<size_t>(i * row_comm_size + row_comm_rank)
: static_cast<size_t>(col_comm_rank * row_comm_size + i);
for (int i = 0; i < col_comm_size; ++i) {
auto vertex_partition_idx = static_cast<size_t>(i * row_comm_size + row_comm_rank);
vertex_t major_first{};
vertex_t major_last{};
std::tie(major_first, major_last) = partition.get_vertex_partition_range(vertex_partition_idx);
auto p_offsets =
partition.is_hypergraph_partitioned()
? adj_matrix_partition_offsets[i]
: adj_matrix_partition_offsets[0] +
(major_first - partition.get_vertex_partition_first(col_comm_rank * row_comm_size));
auto p_offsets = adj_matrix_partition_offsets[i];
thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
thrust::make_counting_iterator(vertex_t{0}),
thrust::make_counting_iterator(major_last - major_first),
local_degrees.data(),
[p_offsets] __device__(auto i) { return p_offsets[i + 1] - p_offsets[i]; });
if (partition.is_hypergraph_partitioned()) {
col_comm.reduce(local_degrees.data(),
i == col_comm_rank ? degrees.data() : static_cast<edge_t *>(nullptr),
static_cast<size_t>(major_last - major_first),
raft::comms::op_t::SUM,
i,
handle.get_stream());
} else {
row_comm.reduce(local_degrees.data(),
i == row_comm_rank ? degrees.data() : static_cast<edge_t *>(nullptr),
static_cast<size_t>(major_last - major_first),
raft::comms::op_t::SUM,
i,
handle.get_stream());
}
col_comm.reduce(local_degrees.data(),
i == col_comm_rank ? degrees.data() : static_cast<edge_t *>(nullptr),
static_cast<size_t>(major_last - major_first),
raft::comms::op_t::SUM,
i,
handle.get_stream());
}

raft::comms::status_t status{};
if (partition.is_hypergraph_partitioned()) {
status =
col_comm.sync_stream(handle.get_stream()); // this is necessary as local_degrees will become
// out-of-scope once this function returns.
} else {
status =
row_comm.sync_stream(handle.get_stream()); // this is necessary as local_degrees will become
// out-of-scope once this function returns.
}
CUGRAPH_EXPECTS(status == raft::comms::status_t::SUCCESS, "sync_stream() failure.");

return degrees;
}
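A small worked illustration of the simplified loop above, with hypothetical communicator sizes:

#include <cstddef>
#include <vector>

// Hypothetical sizes for illustration only: a 3 x 2 GPU grid.
std::vector<std::size_t> example_vertex_partition_indices()
{
  int const row_comm_size = 2;
  int const col_comm_size = 3;
  int const row_comm_rank = 1;

  // Mirrors the simplified loop: this GPU holds one adjacency matrix partition per
  // GPU in its column of the grid, covering the major ranges of vertex partitions
  // 1, 3 and 5, so the per-partition local degrees are always reduced over col_comm
  // (the old is_hypergraph_partitioned() branch is no longer needed).
  std::vector<std::size_t> indices;
  for (int i = 0; i < col_comm_size; ++i) {
    indices.push_back(static_cast<std::size_t>(i * row_comm_size + row_comm_rank));
  }
  return indices;  // {1, 3, 5}
}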

Expand Down Expand Up @@ -170,7 +137,6 @@ struct compute_gpu_id_from_vertex_t {

template <typename vertex_t>
struct compute_gpu_id_from_edge_t {
bool hypergraph_partitioned{false};
int comm_size{0};
int row_comm_size{0};
int col_comm_size{0};
@@ -180,12 +146,22 @@ struct compute_gpu_id_from_edge_t {
cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
auto major_comm_rank = static_cast<int>(hash_func(major) % comm_size);
auto minor_comm_rank = static_cast<int>(hash_func(minor) % comm_size);
if (hypergraph_partitioned) {
return (minor_comm_rank / col_comm_size) * row_comm_size + (major_comm_rank % row_comm_size);
} else {
return (major_comm_rank - (major_comm_rank % row_comm_size)) +
(minor_comm_rank / col_comm_size);
}
return (minor_comm_rank / row_comm_size) * row_comm_size + (major_comm_rank % row_comm_size);
}
};
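A worked example with hypothetical sizes, to show the single mapping that replaces the old two-branch version:

// Hypothetical: comm_size = 6 GPUs arranged as col_comm_size = 3 x row_comm_size = 2.
// If hash_func(major) % 6 == 5 and hash_func(minor) % 6 == 4, the edge is assigned to
// GPU (4 / 2) * 2 + (5 % 2) == 5, i.e. grid row 2 (picked by the minor vertex) and
// grid column 1 (picked by the major vertex); results always fall in [0, comm_size).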

template <typename vertex_t>
struct compute_partition_id_from_edge_t {
int comm_size{0};
int row_comm_size{0};
int col_comm_size{0};

__device__ int operator()(vertex_t major, vertex_t minor) const
{
cuco::detail::MurmurHash3_32<vertex_t> hash_func{};
auto major_comm_rank = static_cast<int>(hash_func(major) % comm_size);
auto minor_comm_rank = static_cast<int>(hash_func(minor) % comm_size);
return major_comm_rank * col_comm_size + minor_comm_rank / row_comm_size;
}
};
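With the same hypothetical sizes and hashed ranks, the partition id for that edge works out as follows:

// Hypothetical: comm_size = 6, row_comm_size = 2, col_comm_size = 3, and hashed ranks
// major -> 5, minor -> 4 as above: 5 * 3 + 4 / 2 == 17. Partition ids therefore range
// over [0, comm_size * col_comm_size), consistent with each GPU holding col_comm_size
// adjacency matrix partitions (as in compute_major_degrees above).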

14 changes: 14 additions & 0 deletions cpp/include/experimental/graph.hpp
@@ -188,6 +188,20 @@ template <typename edge_t>
struct invalid_edge_id : invalid_idx<edge_t> {
};

template <typename vertex_t>
__host__ __device__ std::enable_if_t<std::is_signed<vertex_t>::value, bool> is_valid_vertex(
vertex_t num_vertices, vertex_t v)
{
return (v >= 0) && (v < num_vertices);
}

template <typename vertex_t>
__host__ __device__ std::enable_if_t<std::is_unsigned<vertex_t>::value, bool> is_valid_vertex(
vertex_t num_vertices, vertex_t v)
{
return v < num_vertices;
}

} // namespace experimental
} // namespace cugraph
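A minimal usage sketch of the two overloads above; the values and the include path are assumptions for illustration.

#include <cstdint>
#include <experimental/graph.hpp>  // assumed include path for the header shown above

// Signed vertex ids: both bounds are checked.
bool const ok  = cugraph::experimental::is_valid_vertex(int32_t{100}, int32_t{42});    // true
bool const neg = cugraph::experimental::is_valid_vertex(int32_t{100}, int32_t{-1});    // false
// Unsigned vertex ids: only the upper bound needs checking.
bool const oob = cugraph::experimental::is_valid_vertex(uint32_t{100}, uint32_t{100}); // false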
