From 37b73104e553bc13f3247b437e548dd89f45a4cb Mon Sep 17 00:00:00 2001
From: Seunghwa Kang <45857425+seunghwak@users.noreply.github.com>
Date: Tue, 8 Feb 2022 07:19:07 -0800
Subject: [PATCH] MG Louvain C++ test R-mat usecase parameters (#2061)

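The R-mat usecase parameters were not fully set for the multi-GPU Louvain C++ tests; this PR passes the missing trailing arguments to `Rmat_Usecase` in the R-mat test instantiations.
It also adds optional timing output (enabled by `cugraph::test::g_perf`) for graph construction and Louvain in the MG test, and releases the temporary `old_cluster_sum_v` and `cluster_subtract_v` buffers in `louvain.cuh` once they have been consumed.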

Authors:
  - Seunghwa Kang (https://github.com/seunghwak)

Approvers:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Kumar Aatish (https://github.com/kaatish)

URL: https://github.com/rapidsai/cugraph/pull/2061
---
 cpp/src/community/louvain.cuh           |  6 +++-
 cpp/tests/community/mg_louvain_test.cpp | 45 ++++++++++++++++++++-----
 2 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/cpp/src/community/louvain.cuh b/cpp/src/community/louvain.cuh
index da9425e0a4b..025c520abf5 100644
--- a/cpp/src/community/louvain.cuh
+++ b/cpp/src/community/louvain.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -478,6 +478,10 @@ class Louvain {
                              thrust::make_zip_iterator(thrust::make_tuple(
                                old_cluster_sum_v.begin(), cluster_subtract_v.begin())),
                              src_old_cluster_sum_subtract_pairs);
+      old_cluster_sum_v.resize(0, handle_.get_stream());
+      old_cluster_sum_v.shrink_to_fit(handle_.get_stream());
+      cluster_subtract_v.resize(0, handle_.get_stream());
+      cluster_subtract_v.shrink_to_fit(handle_.get_stream());
     }
 
     auto output_buffer = allocate_dataframe_buffer<thrust::tuple<vertex_t, weight_t>>(
diff --git a/cpp/tests/community/mg_louvain_test.cpp b/cpp/tests/community/mg_louvain_test.cpp
index feb0b91d06a..662d0592ec2 100644
--- a/cpp/tests/community/mg_louvain_test.cpp
+++ b/cpp/tests/community/mg_louvain_test.cpp
@@ -18,6 +18,7 @@
 
 #include <utilities/base_fixture.hpp>
 #include <utilities/device_comm_wrapper.hpp>
+#include <utilities/high_res_clock.h>
 #include <utilities/test_utilities.hpp>
 
 #include <cugraph/algorithms.hpp>
@@ -154,11 +155,11 @@ class Tests_MG_Louvain
   {
     auto [louvain_usecase, input_usecase] = param;
 
-    raft::handle_t handle;
+    raft::handle_t handle{};
+    HighResClock hr_clock{};
 
     raft::comms::initialize_mpi_comms(&handle, MPI_COMM_WORLD);
-    const auto& comm = handle.get_comms();
-
+    const auto& comm     = handle.get_comms();
     auto const comm_size = comm.get_size();
     auto const comm_rank = comm.get_rank();
 
@@ -166,23 +167,47 @@ class Tests_MG_Louvain
     while (comm_size % row_comm_size != 0) {
       --row_comm_size;
     }
+
     cugraph::partition_2d::subcomm_factory_t<cugraph::partition_2d::key_naming_t, vertex_t>
       subcomm_factory(handle, row_comm_size);
 
-    cudaStream_t stream = handle.get_stream();
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      hr_clock.start();
+    }
 
     auto [mg_graph, d_renumber_map_labels] =
       cugraph::test::construct_graph<vertex_t, edge_t, weight_t, false, true>(
         handle, input_usecase, true, true);
 
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG construct_graph took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
     auto mg_graph_view = mg_graph.view();
 
-    std::unique_ptr<cugraph::Dendrogram<vertex_t>> dendrogram;
-    weight_t mg_modularity;
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      hr_clock.start();
+    }
 
-    std::tie(dendrogram, mg_modularity) = cugraph::louvain(
+    auto [dendrogram, mg_modularity] = cugraph::louvain(
       handle, mg_graph_view, louvain_usecase.max_level_, louvain_usecase.resolution_);
 
+    if (cugraph::test::g_perf) {
+      RAFT_CUDA_TRY(cudaDeviceSynchronize());  // for consistent performance measurement
+      handle.get_comms().barrier();
+      double elapsed_time{0.0};
+      hr_clock.stop(&elapsed_time);
+      std::cout << "MG Louvain took " << elapsed_time * 1e-6 << " s.\n";
+    }
+
     if (louvain_usecase.check_correctness_) {
       SCOPED_TRACE("compare modularity input");
 
@@ -281,7 +306,8 @@ INSTANTIATE_TEST_SUITE_P(
   ::testing::Combine(
     // disable correctness checks for large graphs
     ::testing::Values(Louvain_Usecase{}),
-    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false))));
+    ::testing::Values(
+      cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false, 0, true))));
 
 INSTANTIATE_TEST_SUITE_P(
   rmat64_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with
@@ -293,6 +319,7 @@ INSTANTIATE_TEST_SUITE_P(
   ::testing::Combine(
     // disable correctness checks for large graphs
     ::testing::Values(Louvain_Usecase{}),
-    ::testing::Values(cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false))));
+    ::testing::Values(
+      cugraph::test::Rmat_Usecase(20, 32, 0.57, 0.19, 0.19, 0, true, false, 0, true))));
 
 CUGRAPH_MG_TEST_PROGRAM_MAIN()