From 8b1004e594d1ecb532003d1a971cc1c31e42b932 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <37386037+aschaffer@users.noreply.github.com>
Date: Wed, 28 Apr 2021 15:56:27 -0500
Subject: [PATCH] Added Random Walks COO convertor and profiling (#1531)

This PR is used to track enhancements to Random Walks functionality:
1. Paths2COO convertor: converts coalesced vertex/weight paths to COO format + offsets (including C++ API for Cython);
2. RW profiling;
3. Moving functionality / tests out of `experimental` sub-dirs;

Authors:
  - Andrei Schaffer (https://github.com/aschaffer)
  - Rick Ratzel (https://github.com/rlratzel)

Approvers:
  - Rick Ratzel (https://github.com/rlratzel)
  - Chuck Hastings (https://github.com/ChuckHastings)

URL: https://github.com/rapidsai/cugraph/pull/1531
---
 cpp/include/algorithms.hpp                    |   1 +
 cpp/include/utilities/cython.hpp              |  26 +++
 cpp/include/utilities/path_retrieval.hpp      |  26 +++
 cpp/src/sampling/random_walks.cu              |  26 ++-
 .../random_walks.cuh                          | 193 ++++++++++++++++
 cpp/src/utilities/cython.cu                   |  31 +++
 cpp/src/utilities/high_res_timer.hpp          |  15 ++
 cpp/tests/CMakeLists.txt                      |  21 +-
 cpp/tests/sampling/random_walks_profiling.cu  | 216 ++++++++++++++++++
 .../random_walks_test.cu                      |   2 +-
 .../random_walks_utils.cuh                    |   2 +-
 .../rw_low_level_test.cu                      | 120 +++++++++-
 cpp/tests/utilities/base_fixture.hpp          |   2 +-
 13 files changed, 670 insertions(+), 11 deletions(-)
 rename cpp/src/{experimental => sampling}/random_walks.cuh (82%)
 create mode 100644 cpp/tests/sampling/random_walks_profiling.cu
 rename cpp/tests/{experimental => sampling}/random_walks_test.cu (99%)
 rename cpp/tests/{experimental => sampling}/random_walks_utils.cuh (99%)
 rename cpp/tests/{experimental => sampling}/rw_low_level_test.cu (86%)
diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp
index 0b45b799357..7a7a0219d74 100644
--- a/cpp/include/algorithms.hpp
+++ b/cpp/include/algorithms.hpp
@@ -1280,5 +1280,6 @@ random_walks(raft::handle_t const &handle,
              typename graph_t::vertex_type const *ptr_d_start,
              index_t num_paths,
              index_t max_depth);
+
 }  // namespace experimental
 }  // namespace cugraph
diff --git a/cpp/include/utilities/cython.hpp b/cpp/include/utilities/cython.hpp
index c1d0c836225..0d6cb2f63d0 100644
--- a/cpp/include/utilities/cython.hpp
+++ b/cpp/include/utilities/cython.hpp
@@ -207,6 +207,26 @@ struct random_walk_ret_t {
   std::unique_ptr<rmm::device_buffer> d_sizes_;
 };
 
+// aggregate for random_walks() COO return type
+// to be exposed to cython:
+//
+struct random_walk_coo_t {
+  size_t num_edges_;    // total number of COO triplets (for all paths)
+  size_t num_offsets_;  // number of entries in d_offsets_ (one per path's COO set);
+                        // NOTE: this can differ from num_paths_,
+                        // because paths with 0 edges (one vertex)
+                        // don't participate in the COO
+
+  std::unique_ptr<rmm::device_buffer>
+    d_src_;  // coalesced set of COO source vertices; |d_src_| = num_edges_
+  std::unique_ptr<rmm::device_buffer>
+    d_dst_;  // coalesced set of COO destination vertices; |d_dst_| = num_edges_
+  std::unique_ptr<rmm::device_buffer>
+    d_weights_;  // coalesced set of COO edge weights; |d_weights_| = num_edges_
+  std::unique_ptr<rmm::device_buffer>
+    d_offsets_;  // offsets where the COO subset of each path starts; |d_offsets_| = num_offsets_
+};
+
 // wrapper for renumber_edgelist() return
 // (unrenumbering maps, etc.)
 //
@@ -479,6 +499,12 @@ call_random_walks(raft::handle_t const& handle,
                   edge_t num_paths,
                   edge_t max_depth);
 
+// convertor from random_walks return type to COO:
+//
+template <typename vertex_t, typename index_t>
+std::unique_ptr<random_walk_coo_t> random_walks_to_coo(raft::handle_t const& handle,
+                                                       random_walk_ret_t& rw_ret);
+
 // wrapper for shuffling:
 //
 template <typename vertex_t, typename edge_t, typename weight_t>
diff --git a/cpp/include/utilities/path_retrieval.hpp b/cpp/include/utilities/path_retrieval.hpp
index e626d6af1ab..fd0d36b67d6 100644
--- a/cpp/include/utilities/path_retrieval.hpp
+++ b/cpp/include/utilities/path_retrieval.hpp
@@ -42,4 +42,30 @@ void get_traversed_cost(raft::handle_t const &handle,
                         weight_t *out,
                         vertex_t stop_vertex,
                         vertex_t num_vertices);
+
+namespace experimental {
+/**
+ * @brief returns the COO format (src_vector, dst_vector) from the random walks (RW)
+ * paths.
+ *
+ * @tparam vertex_t Type of vertex indices.
+ * @tparam index_t Type used to store indexing and sizes.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param coalesced_sz_v coalesced vertex vector size.
+ * @param num_paths number of paths.
+ * @param d_coalesced_v coalesced vertex buffer.
+ * @param d_sizes paths size buffer.
+ * @return tuple of (src_vertex_vector, dst_vertex_vector, path_offsets), where
+ * path_offsets are the offsets where the COO set of each path starts.
+ */
+template <typename vertex_t, typename index_t>
+std::
+  tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>, rmm::device_uvector<index_t>>
+  convert_paths_to_coo(raft::handle_t const &handle,
+                       index_t coalesced_sz_v,
+                       index_t num_paths,
+                       rmm::device_buffer &&d_coalesced_v,
+                       rmm::device_buffer &&d_sizes);
+}  // namespace experimental
 }  // namespace cugraph
diff --git a/cpp/src/sampling/random_walks.cu b/cpp/src/sampling/random_walks.cu
index 88d5d9ed5c8..d1d0382d46f 100644
--- a/cpp/src/sampling/random_walks.cu
+++ b/cpp/src/sampling/random_walks.cu
@@ -17,7 +17,7 @@
 // Andrei Schaffer, aschaffer@nvidia.com
 //
 #include <algorithms.hpp>
-#include <experimental/random_walks.cuh>
+#include "random_walks.cuh"
 
 namespace cugraph {
 namespace experimental {
@@ -73,6 +73,30 @@ template std::
                int64_t const* ptr_d_start,
                int64_t num_paths,
                int64_t max_depth);
+
+template std::  // vertex_t = int32_t, index_t = int32_t
+  tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>>
+  convert_paths_to_coo(raft::handle_t const& handle,
+                       int32_t coalesced_sz_v,
+                       int32_t num_paths,
+                       rmm::device_buffer&& d_coalesced_v,
+                       rmm::device_buffer&& d_sizes);
+
+template std::  // vertex_t = int32_t, index_t = int64_t
+  tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<int32_t>, rmm::device_uvector<int64_t>>
+  convert_paths_to_coo(raft::handle_t const& handle,
+                       int64_t coalesced_sz_v,
+                       int64_t num_paths,
+                       rmm::device_buffer&& d_coalesced_v,
+                       rmm::device_buffer&& d_sizes);
+
+template std::  // vertex_t = int64_t, index_t = int64_t
+  tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>, rmm::device_uvector<int64_t>>
+  convert_paths_to_coo(raft::handle_t const& handle,
+                       int64_t coalesced_sz_v,
+                       int64_t num_paths,
+                       rmm::device_buffer&& d_coalesced_v,
+                       rmm::device_buffer&& d_sizes);
 //}
 }  // namespace experimental
 }  // namespace cugraph
diff --git a/cpp/src/experimental/random_walks.cuh b/cpp/src/sampling/random_walks.cuh
similarity index 82%
rename from cpp/src/experimental/random_walks.cuh
rename to cpp/src/sampling/random_walks.cuh
index aea8f3d8420..82665003769 100644
--- a/cpp/src/experimental/random_walks.cuh
+++ b/cpp/src/sampling/random_walks.cuh
@@ -40,6 +40,7 @@
 #include <thrust/logical.h>
 #include <thrust/remove.h>
 #include <thrust/transform.h>
+#include <thrust/transform_scan.h>
 #include <thrust/tuple.h>
 
 #include <cassert>
@@ -103,6 +104,12 @@ struct device_const_vector_view {
   index_t size_;
 };
 
+template <typename value_t>
+value_t const* raw_const_ptr(device_const_vector_view<value_t>& d_view)
+{
+  return d_view.begin();  // hand out the view's begin() as a read-only raw pointer
+}
+
 // raft random generator:
 // (using upper-bound cached "map"
 //  giving out_deg(v) for each v in [0, |V|);
@@ -840,6 +847,156 @@ random_walks_impl(raft::handle_t const& handle,
   CUGRAPH_FAIL("Not implemented yet.");
 }
 
+// provides conversion of (coalesced) vertex paths to COO format: (src[], dst[]) plus
+// the offsets where the COO set of each path with at least one edge starts;
+// (which in turn provides an API consistent with egonet)
+//
+template <typename vertex_t, typename index_t>
+struct coo_convertor_t {
+  coo_convertor_t(raft::handle_t const& handle, index_t num_paths)
+    : handle_(handle), num_paths_(num_paths)
+  {
+  }
+
+  // d_coalesced_v: vertices of all paths, back to back; d_sizes: vertex count of each path;
+  // returns (src[], dst[], offsets[]); fails (CUGRAPH_EXPECTS) on inconsistent sizes;
+  //
+  std::tuple<device_vec_t<vertex_t>, device_vec_t<vertex_t>, device_vec_t<index_t>> operator()(
+    device_const_vector_view<vertex_t>& d_coalesced_v,
+    device_const_vector_view<index_t>& d_sizes) const
+  {
+    CUGRAPH_EXPECTS(static_cast<index_t>(d_sizes.size()) == num_paths_, "Invalid size vector.");
+
+    // stencil of valid edge destinations + total number of coalesced vertices:
+    auto tupl_fill   = fill_stencil(d_sizes);
+    auto&& d_stencil = std::move(std::get<0>(tupl_fill));
+    auto total_sz_v  = std::get<1>(tupl_fill);
+
+    CUGRAPH_EXPECTS(static_cast<index_t>(d_coalesced_v.size()) == total_sz_v,
+                    "Inconsistent vertex coalesced size data.");
+
+    auto src_dst_tpl = gather_pairs(d_coalesced_v, d_stencil, total_sz_v);
+
+    auto&& d_src = std::move(std::get<0>(src_dst_tpl));
+    auto&& d_dst = std::move(std::get<1>(src_dst_tpl));
+
+    device_vec_t<index_t> d_sz_w_scan(num_paths_, handle_.get_stream());
+
+    // copy vertex path sizes that are > 1: (because vertex_path_sz translates
+    // into edge_path_sz = vertex_path_sz - 1, and edge_path_sz == 0
+    // doesn't contribute anything):
+    auto new_end_it =
+      thrust::copy_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+                      d_sizes.begin(),
+                      d_sizes.end(),
+                      d_sz_w_scan.begin(),
+                      [] __device__(auto sz_value) { return sz_value > 1; });
+
+    // resize to new_end:
+    d_sz_w_scan.resize(thrust::distance(d_sz_w_scan.begin(), new_end_it), handle_.get_stream());
+
+    // get paths' edge number exclusive scan by transforming paths'
+    // vertex numbers that are > 1, via transformation:
+    // edge_path_sz = (vertex_path_sz-1):
+    thrust::transform_exclusive_scan(
+      rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+      d_sz_w_scan.begin(),
+      d_sz_w_scan.end(),
+      d_sz_w_scan.begin(),
+      [] __device__(auto sz) { return sz - 1; },
+      index_t{0},
+      thrust::plus<index_t>{});
+
+    return std::make_tuple(std::move(d_src), std::move(d_dst), std::move(d_sz_w_scan));
+  }
+
+  // returns (stencil[], total vertex count, inclusive scan of d_sizes[]); stencil[i] == 0
+  // iff position i starts a path other than the first one; requires num_paths_ > 0;
+  //
+  std::tuple<device_vec_t<int>, index_t, device_vec_t<index_t>> fill_stencil(
+    device_const_vector_view<index_t>& d_sizes) const
+  {
+    CUGRAPH_EXPECTS(num_paths_ > 0, "Invalid number of paths.");  // last scan entry is read below
+
+    device_vec_t<index_t> d_scan(num_paths_, handle_.get_stream());
+    thrust::inclusive_scan(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+                           d_sizes.begin(),
+                           d_sizes.end(),
+                           d_scan.begin());
+
+    index_t total_sz{0};  // total vertex count == last entry of the inclusive scan
+    CUDA_TRY(cudaMemcpyAsync(&total_sz,
+                             raw_ptr(d_scan) + num_paths_ - 1,
+                             sizeof(index_t),
+                             cudaMemcpyDeviceToHost,
+                             handle_.get_stream()));
+    CUDA_TRY(cudaStreamSynchronize(handle_.get_stream()));  // total_sz is used on the host next
+
+    device_vec_t<int> d_stencil(total_sz, handle_.get_stream());  // set to all 1's below
+    thrust::copy_n(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+                   thrust::make_constant_iterator<int>(1),
+                   d_stencil.size(),
+                   d_stencil.begin());
+
+    // set to 0 the first num_paths_ - 1 scan positions (where a path ends and the next one
+    // starts); the last scan entry == total_sz is past the stencil's end and must be skipped:
+    thrust::scatter(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+                    thrust::make_constant_iterator(0),
+                    thrust::make_constant_iterator(0) + (num_paths_ - 1),
+                    d_scan.begin(),
+                    d_stencil.begin());
+
+    return std::make_tuple(std::move(d_stencil), total_sz, std::move(d_scan));
+  }
+
+  // returns (src[], dst[]): all pairs of consecutive entries of d_coalesced_v
+  // whose second entry is not marked 0 (start of a new path) in d_stencil;
+  //
+  std::tuple<device_vec_t<vertex_t>, device_vec_t<vertex_t>> gather_pairs(
+    device_const_vector_view<vertex_t>& d_coalesced_v,
+    device_vec_t<int> const& d_stencil,
+    index_t total_sz_v) const
+  {
+    auto total_sz_w = total_sz_v - num_paths_;
+    device_vec_t<index_t> valid_src_indx(total_sz_w, handle_.get_stream());
+
+    // generate valid vertex src indices, which is any index in {0,...,total_sz_v - 2}
+    // provided the next index position; i.e., (index+1), in stencil is not 0;
+    // (if it is, there's no "next" or dst index, because the path has ended);
+    //
+    thrust::copy_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+                    thrust::make_counting_iterator<index_t>(0),
+                    thrust::make_counting_iterator<index_t>(total_sz_v - 1),
+                    valid_src_indx.begin(),
+                    [ptr_d_stencil = raw_const_ptr(d_stencil)] __device__(auto indx) {
+                      auto dst_indx = indx + 1;
+                      return ptr_d_stencil[dst_indx] == 1;
+                    });
+
+    device_vec_t<vertex_t> d_src_v(total_sz_w, handle_.get_stream());
+    device_vec_t<vertex_t> d_dst_v(total_sz_w, handle_.get_stream());
+
+    // construct pair of src[], dst[] by gathering from d_coalesced_v
+    // all pairs at entries (valid_src_indx, valid_src_indx+1), where the
+    // set of valid_src_indx was generated at the previous step;
+    //
+    thrust::transform(
+      rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+      valid_src_indx.begin(),
+      valid_src_indx.end(),
+      thrust::make_zip_iterator(thrust::make_tuple(d_src_v.begin(), d_dst_v.begin())),  // start_zip
+      [ptr_d_vertex = raw_const_ptr(d_coalesced_v)] __device__(auto indx) {
+        return thrust::make_tuple(ptr_d_vertex[indx], ptr_d_vertex[indx + 1]);
+      });
+
+    return std::make_tuple(std::move(d_src_v), std::move(d_dst_v));
+  }
+
+ private:
+  raft::handle_t const& handle_;
+  index_t num_paths_;
+};
+
 }  // namespace detail
 
 /**
@@ -883,5 +1040,41 @@ random_walks(raft::handle_t const& handle,
                          std::move(std::get<1>(quad_tuple)),
                          std::move(std::get<2>(quad_tuple)));
 }
+
+/**
+ * @brief Converts coalesced random walks (RW) paths to COO format:
+ * (src_vector, dst_vector) plus the offsets of each path's COO set.
+ *
+ * @tparam vertex_t Type of vertex indices.
+ * @tparam index_t Type used to store indexing and sizes.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param coalesced_sz_v number of vertex_t entries in d_coalesced_v.
+ * @param num_paths number of paths, i.e., number of index_t entries in d_sizes.
+ * @param d_coalesced_v coalesced vertex buffer; only read through a const view here.
+ * @param d_sizes paths size buffer; only read through a const view here.
+ * @return tuple of (src_vertex_vector, dst_vertex_vector, path_offsets), where
+ * path_offsets are the offsets where the COO set of each path starts.
+ */
+template <typename vertex_t, typename index_t>
+std::
+  tuple<rmm::device_uvector<vertex_t>, rmm::device_uvector<vertex_t>, rmm::device_uvector<index_t>>
+  convert_paths_to_coo(raft::handle_t const& handle,
+                       index_t coalesced_sz_v,
+                       index_t num_paths,
+                       rmm::device_buffer&& d_coalesced_v,
+                       rmm::device_buffer&& d_sizes)
+{
+  detail::device_const_vector_view<vertex_t> v_view(
+    static_cast<vertex_t const*>(d_coalesced_v.data()), coalesced_sz_v);
+
+  detail::device_const_vector_view<index_t> sz_view(static_cast<index_t const*>(d_sizes.data()),
+                                                    num_paths);
+
+  detail::coo_convertor_t<vertex_t, index_t> convertor(handle, num_paths);
+
+  return convertor(v_view, sz_view);
+}
+
 }  // namespace experimental
 }  // namespace cugraph
diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu
index 9729039fd48..b4dcd84a7e1 100644
--- a/cpp/src/utilities/cython.cu
+++ b/cpp/src/utilities/cython.cu
@@ -22,6 +22,7 @@
 #include <partition_manager.hpp>
 #include <utilities/cython.hpp>
 #include <utilities/error.hpp>
+#include <utilities/path_retrieval.hpp>
 #include <utilities/shuffle_comm.cuh>
 
 #include <rmm/thrust_rmm_allocator.h>
@@ -844,6 +845,27 @@ call_random_walks(raft::handle_t const& handle,
   }
 }
 
+template <typename vertex_t, typename index_t>
+std::unique_ptr<random_walk_coo_t> random_walks_to_coo(raft::handle_t const& handle,
+                                                       random_walk_ret_t& rw_tri)
+{
+  auto coo = cugraph::experimental::convert_paths_to_coo<vertex_t, index_t>(
+    handle,
+    static_cast<index_t>(rw_tri.coalesced_sz_v_),
+    static_cast<index_t>(rw_tri.num_paths_),
+    std::move(*rw_tri.d_coalesced_v_),
+    std::move(*rw_tri.d_sizes_));
+
+  // (src, dst, offsets) -> cython aggregate; each size is read before its vector is released:
+  return std::make_unique<random_walk_coo_t>(
+    random_walk_coo_t{std::get<0>(coo).size(),
+                      std::get<2>(coo).size(),
+                      std::make_unique<rmm::device_buffer>(std::get<0>(coo).release()),
+                      std::make_unique<rmm::device_buffer>(std::get<1>(coo).release()),
+                      std::move(rw_tri.d_coalesced_w_),  // pass-through
+                      std::make_unique<rmm::device_buffer>(std::get<2>(coo).release())});
+}
+
 // Wrapper for calling SSSP through a graph container
 template <typename vertex_t, typename weight_t>
 void call_sssp(raft::handle_t const& handle,
@@ -1233,6 +1255,15 @@ template std::unique_ptr<random_walk_ret_t> call_random_walks<int64_t, int64_t>(
   int64_t num_paths,
   int64_t max_depth);
 
+template std::unique_ptr<random_walk_coo_t> random_walks_to_coo<int32_t, int32_t>(
+  raft::handle_t const& handle, random_walk_ret_t& rw_tri);
+
+template std::unique_ptr<random_walk_coo_t> random_walks_to_coo<int32_t, int64_t>(
+  raft::handle_t const& handle, random_walk_ret_t& rw_tri);
+
+template std::unique_ptr<random_walk_coo_t> random_walks_to_coo<int64_t, int64_t>(
+  raft::handle_t const& handle, random_walk_ret_t& rw_tri);
+
 template void call_sssp(raft::handle_t const& handle,
                         graph_container_t const& graph_container,
                         int32_t* identifiers,
diff --git a/cpp/src/utilities/high_res_timer.hpp b/cpp/src/utilities/high_res_timer.hpp
index a731c5edc9d..807496c8f86 100644
--- a/cpp/src/utilities/high_res_timer.hpp
+++ b/cpp/src/utilities/high_res_timer.hpp
@@ -18,6 +18,8 @@
 #include <ctime>
 #include <iostream>
 #include <map>
+#include <sstream>
+#include <stdexcept>
 #include <string>
 
 //#define TIMING
@@ -52,6 +54,19 @@ class HighResTimer {
     it->second.second += stop_time.tv_sec * 1000000000 + stop_time.tv_nsec;
   }
 
+  // Average runtime [ms] for `label`: accumulated ns (second.second) / (1e6 * second.first,
+  // presumably the number of timed runs - confirm); throws std::runtime_error if not found.
+  double get_average_runtime(std::string const &label)
+  {
+    auto it = timers.find(label);
+    if (it == timers.end()) {
+      std::stringstream ss;
+      ss << "ERROR: timing label: " << label << " not found.";
+      throw std::runtime_error(ss.str());
+    }
+    return (static_cast<double>(it->second.second) / (1000000.0 * it->second.first));
+  }
+
   //
   //  Add display functions... specific label or entire structure
   //
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 7a544fd75fb..80484fdfad6 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -539,16 +539,25 @@ ConfigureTest(EXPERIMENTAL_KATZ_CENTRALITY_TEST "${EXPERIMENTAL_KATZ_CENTRALITY_
 ###################################################################################################
 # - Experimental RANDOM_WALKS tests ------------------------------------------------------------
 
-set(EXPERIMENTAL_RANDOM_WALKS_TEST_SRCS
-    "${CMAKE_CURRENT_SOURCE_DIR}/experimental/random_walks_test.cu")
+set(RANDOM_WALKS_TEST_SRCS
+    "${CMAKE_CURRENT_SOURCE_DIR}/sampling/random_walks_test.cu")
 
-ConfigureTest(EXPERIMENTAL_RANDOM_WALKS_TEST "${EXPERIMENTAL_RANDOM_WALKS_TEST_SRCS}")
+ConfigureTest(RANDOM_WALKS_TEST "${RANDOM_WALKS_TEST_SRCS}")
 
 ###################################################################################################
-set(EXPERIMENTAL_RANDOM_WALKS_LOW_LEVEL_SRCS
-    "${CMAKE_CURRENT_SOURCE_DIR}/experimental/rw_low_level_test.cu")
+set(RANDOM_WALKS_LOW_LEVEL_SRCS
+    "${CMAKE_CURRENT_SOURCE_DIR}/sampling/rw_low_level_test.cu")
 
-ConfigureTest(EXPERIMENTAL_RANDOM_WALKS_LOW_LEVEL_TEST "${EXPERIMENTAL_RANDOM_WALKS_LOW_LEVEL_SRCS}")
+ConfigureTest(RANDOM_WALKS_LOW_LEVEL_TEST "${RANDOM_WALKS_LOW_LEVEL_SRCS}")
+
+###################################################################################################
+set(RANDOM_WALKS_PROFILING_SRCS
+    "${CMAKE_CURRENT_SOURCE_DIR}/sampling/random_walks_profiling.cu")
+
+# FIXME: since this is technically not a test, consider refactoring the
+# ConfigureTest function to share common code with a new ConfigureBenchmark
+# function (which would not link gtest, etc.)
+ConfigureTest(RANDOM_WALKS_PROFILING "${RANDOM_WALKS_PROFILING_SRCS}")
 
 
 ###################################################################################################
diff --git a/cpp/tests/sampling/random_walks_profiling.cu b/cpp/tests/sampling/random_walks_profiling.cu
new file mode 100644
index 00000000000..397196c4c78
--- /dev/null
+++ b/cpp/tests/sampling/random_walks_profiling.cu
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utilities/base_fixture.hpp>  // cugraph::test::create_memory_resource()
+#include <utilities/high_res_timer.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <algorithms.hpp>
+#include <graph.hpp>
+#include <sampling/random_walks.cuh>
+
+#include <raft/handle.hpp>
+#include <raft/random/rng.cuh>
+
+#include <rmm/thrust_rmm_allocator.h>
+
+#include <cuda_profiler_api.h>
+#include <thrust/random.h>
+
+#include <algorithm>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <vector>
+
+/**
+ * @internal
+ * @brief Fills the device vector d_start with one starting vertex per RW path:
+ * path i starts at vertex (i % num_vertices); the number of paths is d_start.size().
+ */
+template <typename vertex_t, typename index_t>
+void fill_start(raft::handle_t const& handle,
+                rmm::device_uvector<vertex_t>& d_start,
+                index_t num_vertices)
+{
+  index_t const path_count = d_start.size();
+  auto const first_indx    = thrust::make_counting_iterator<index_t>(0);
+
+  thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                    first_indx,
+                    first_indx + path_count,
+                    d_start.begin(),
+                    [num_vertices] __device__(auto i) { return i % num_vertices; });
+}
+
+/**
+ * @internal
+ * @brief Times a single random_walks_impl() call for `num_paths` paths and prints
+ * the average runtime per path, followed by the timer's own report.
+ */
+template <typename graph_vt>
+void output_random_walks_time(graph_vt const& graph_view, typename graph_vt::edge_type num_paths)
+{
+  using vertex_t = typename graph_vt::vertex_type;
+  using edge_t   = typename graph_vt::edge_type;
+  using weight_t = typename graph_vt::weight_type;
+
+  edge_t max_depth{10};
+  std::string label("RandomWalks");
+
+  raft::handle_t handle{};
+  // one start vertex per requested path:
+  //
+  rmm::device_uvector<vertex_t> d_start(num_paths, handle.get_stream());
+  vertex_t num_vertices = graph_view.get_number_of_vertices();
+  fill_start(handle, d_start, num_vertices);
+
+  // 0-copy const device view over the start vertices:
+  //
+  cugraph::experimental::detail::device_const_vector_view<vertex_t, edge_t> d_start_view{
+    d_start.data(), num_paths};
+
+  // only the random_walks_impl() call sits between timer / profiler start and stop:
+  //
+  HighResTimer hr_timer;
+  hr_timer.start(label);
+  cudaProfilerStart();
+  auto ret_tuple =
+    cugraph::experimental::detail::random_walks_impl(handle, graph_view, d_start_view, max_depth);
+  cudaProfilerStop();
+  hr_timer.stop();
+  try {
+    auto avg_runtime_ms = hr_timer.get_average_runtime(label);
+    std::cout << "RW for num_paths: " << num_paths
+              << ", runtime [ms] / path: " << avg_runtime_ms / num_paths << ":\n";
+  } catch (std::exception const& ex) {
+    std::cerr << ex.what() << '\n';
+    return;
+  } catch (...) {
+    std::cerr << "ERROR: Unknown exception on timer label search." << '\n';
+    return;
+  }
+  hr_timer.display(std::cout);
+}
+
+/**
+ * @struct RandomWalks_Usecase
+ * @brief Input specification for a random_walks benchmark/profile run
+ *
+ * @var RandomWalks_Usecase::graph_file_full_path  Path of the graph file to
+ * read. A non-empty constructor arg that does not start with '/' is prefixed
+ * with cugraph::test::get_rapids_dataset_root_dir() and "/"; any other value
+ * is stored unchanged. Default-initialized to an empty string.
+ *
+ * @var RandomWalks_Usecase::test_weighted Bool representing if the specified
+ * graph is weighted or not. Default member initializer is false (unweighted).
+ */
+struct RandomWalks_Usecase {
+  std::string graph_file_full_path{};
+  bool test_weighted{false};
+
+  RandomWalks_Usecase(std::string const& graph_file_path, bool test_weighted)
+    : test_weighted(test_weighted)
+  {
+    // empty and absolute ('/'-prefixed) paths are used as given:
+    bool const is_relative = !graph_file_path.empty() && (graph_file_path[0] != '/');
+    graph_file_full_path =
+      is_relative ? cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path
+                  : graph_file_path;
+  };
+};
+
+/**
+ * @brief Profiles random_walks on the graph described by a RandomWalks_Usecase
+ *
+ * Reads the Matrix Market file named by configuration.graph_file_full_path into
+ * a graph_t (honoring configuration.test_weighted), then calls
+ * output_random_walks_time() on its view for 1, 10, and 100 paths, in that
+ * order.
+ *
+ * @tparam vertex_t          Type of vertex identifiers.
+ * @tparam edge_t            Type of edge identifiers.
+ * @tparam weight_t          Type of weight identifiers.
+ *
+ * @param[in] configuration RandomWalks_Usecase instance containing the input
+ * file to read for constructing the graph_t.
+ */
+template <typename vertex_t, typename edge_t, typename weight_t>
+void run(RandomWalks_Usecase const& configuration)
+{
+  using graph_type = cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, false, false>;
+
+  raft::handle_t handle{};
+  graph_type graph(handle);
+  std::tie(graph, std::ignore) =
+    cugraph::test::read_graph_from_matrix_market_file<vertex_t, edge_t, weight_t, false, false>(
+      handle, configuration.graph_file_full_path, configuration.test_weighted, false);
+  auto graph_view = graph.view();
+
+  // FIXME: the num_paths vector might be better specified via the
+  // configuration input instead of hardcoding here.
+  std::vector<edge_t> const path_counts{1, 10, 100};
+  for (edge_t num_paths : path_counts) { output_random_walks_time(graph_view, num_paths); }
+}
+
+/**
+ * @brief Entry point of the random_walks benchmark/profiling executable
+ *
+ * Parses two command-line options: "rmm_mode", the RMM allocation mode (e.g.
+ * "binning", "cuda", "pool", or "managed"; the value is handed unchanged to
+ * cugraph::test::create_memory_resource()), and "dataset", the input .mtx file
+ * used to populate the graph_t instance (a path not starting with '/' is
+ * resolved by RandomWalks_Usecase). The graph is read as weighted.
+ *
+ * To use the default values of rmm_mode=pool and
+ * dataset=test/datasets/karate.mtx:
+ * @code
+ *   RANDOM_WALKS_PROFILING
+ * @endcode
+ *
+ * To specify managed memory and the netscience.mtx dataset (relative to a
+ * particular RAPIDS_DATASET_ROOT_DIR setting):
+ * @code
+ *   RANDOM_WALKS_PROFILING --rmm_mode=managed --dataset=test/datasets/netscience.mtx
+ * @endcode
+ *
+ * @return Always 0 (see the FIXME at the end of the function body).
+ */
+int main(int argc, char** argv)
+{
+  // Command-line options and their defaults: rmm_mode, dataset
+  cxxopts::Options options(argv[0], " - Random Walks benchmark command line options");
+  options.add_options()(
+    "rmm_mode", "RMM allocation mode", cxxopts::value<std::string>()->default_value("pool"));
+  options.add_options()(
+    "dataset", "dataset", cxxopts::value<std::string>()->default_value("test/datasets/karate.mtx"));
+  auto const parsed_args  = options.parse(argc, argv);
+  auto const rmm_mode     = parsed_args["rmm_mode"].as<std::string>();
+  auto const dataset_path = parsed_args["dataset"].as<std::string>();
+
+  // Make the requested RMM resource the current device resource
+  auto mem_resource = cugraph::test::create_memory_resource(rmm_mode);
+  rmm::mr::set_current_device_resource(mem_resource.get());
+
+  // Profile on the selected dataset
+  std::cout << "Using dataset: " << dataset_path << std::endl;
+  run<int32_t, int32_t, float>(RandomWalks_Usecase(dataset_path, true));
+
+  // FIXME: consider returning non-zero for situations that warrant it (eg. if
+  // the algo ran but the results are invalid, if a benchmark threshold is
+  // exceeded, etc.)
+  return 0;
+}
diff --git a/cpp/tests/experimental/random_walks_test.cu b/cpp/tests/sampling/random_walks_test.cu
similarity index 99%
rename from cpp/tests/experimental/random_walks_test.cu
rename to cpp/tests/sampling/random_walks_test.cu
index d692f6a7592..9e4ecd0d024 100644
--- a/cpp/tests/experimental/random_walks_test.cu
+++ b/cpp/tests/sampling/random_walks_test.cu
@@ -24,8 +24,8 @@
 #include <thrust/random.h>
 
 #include <algorithms.hpp>
-#include <experimental/random_walks.cuh>
 #include <graph.hpp>
+#include <sampling/random_walks.cuh>
 
 #include <raft/handle.hpp>
 #include <raft/random/rng.cuh>
diff --git a/cpp/tests/experimental/random_walks_utils.cuh b/cpp/tests/sampling/random_walks_utils.cuh
similarity index 99%
rename from cpp/tests/experimental/random_walks_utils.cuh
rename to cpp/tests/sampling/random_walks_utils.cuh
index 863094dc310..b0b06e7f65a 100644
--- a/cpp/tests/experimental/random_walks_utils.cuh
+++ b/cpp/tests/sampling/random_walks_utils.cuh
@@ -16,8 +16,8 @@
 #pragma once
 
 #include <rmm/thrust_rmm_allocator.h>
-#include <experimental/random_walks.cuh>
 #include <graph.hpp>
+#include <sampling/random_walks.cuh>
 
 #include <raft/handle.hpp>
 
diff --git a/cpp/tests/experimental/rw_low_level_test.cu b/cpp/tests/sampling/rw_low_level_test.cu
similarity index 86%
rename from cpp/tests/experimental/rw_low_level_test.cu
rename to cpp/tests/sampling/rw_low_level_test.cu
index 8b562bc41f6..dd7fd14b3a2 100644
--- a/cpp/tests/experimental/rw_low_level_test.cu
+++ b/cpp/tests/sampling/rw_low_level_test.cu
@@ -24,8 +24,8 @@
 #include <thrust/random.h>
 
 #include <algorithms.hpp>
-#include <experimental/random_walks.cuh>
 #include <graph.hpp>
+#include <sampling/random_walks.cuh>
 
 #include <raft/handle.hpp>
 #include <raft/random/rng.cuh>
@@ -782,3 +782,125 @@ TEST_F(RandomWalksPrimsTest, SimpleGraphRandomWalk)
 
   ASSERT_TRUE(test_all_paths);
 }
+
+// Runs one random walk (a single start vertex) on a small weighted graph and
+// validates the returned path with cugraph::test::host_check_rw_paths().
+TEST(RandomWalksSpecialCase, SingleRandomWalk)
+{
+  using vertex_t = int32_t;
+  using edge_t   = vertex_t;
+  using weight_t = float;
+  using index_t  = vertex_t;
+
+  raft::handle_t handle{};
+
+  edge_t num_edges      = 8;
+  vertex_t num_vertices = 6;
+
+  // Edge list (sources, destinations, weights) handed to make_graph().
+  std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
+  std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1};
+
+  auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges, true);
+
+  auto graph_view = graph.view();
+
+  // Special case under test: exactly one path, starting at vertex 2.
+  std::vector<vertex_t> v_start{2};
+  vector_test_t<vertex_t> d_v_start(v_start.size(), handle.get_stream());
+  raft::update_device(d_v_start.data(), v_start.data(), d_v_start.size(), handle.get_stream());
+
+  index_t num_paths = v_start.size();
+  index_t max_depth = 5;
+
+  // 0-copy const device view:
+  //
+  detail::device_const_vector_view<vertex_t, index_t> d_start_view{d_v_start.data(), num_paths};
+  auto quad = detail::random_walks_impl(handle, graph_view, d_start_view, max_depth);
+
+  // Tuple layout, per the names used here: vertices, weights, path sizes, seed.
+  auto& d_coalesced_v = std::get<0>(quad);
+  auto& d_coalesced_w = std::get<1>(quad);
+  auto& d_sizes       = std::get<2>(quad);
+  auto seed0          = std::get<3>(quad);
+
+  bool test_all_paths =
+    cugraph::test::host_check_rw_paths(handle, graph_view, d_coalesced_v, d_coalesced_w, d_sizes);
+
+  // Print the seed returned by random_walks_impl() when the check fails.
+  if (!test_all_paths) std::cout << "starting seed on failure: " << seed0 << '\n';
+
+  ASSERT_TRUE(test_all_paths);
+}
+
+// Checks convert_paths_to_coo() on five hand-written coalesced paths of sizes
+// {2, 1, 3, 5, 1}; the expected output has no (src, dst) pair for size-1 paths.
+TEST(RandomWalksUtility, PathsToCOO)
+{
+  using namespace cugraph::experimental::detail;
+
+  using vertex_t = int32_t;
+  using weight_t = float;
+  using index_t  = vertex_t;
+
+  raft::handle_t handle{};
+
+  std::vector<index_t> v_sizes{2, 1, 3, 5, 1};
+  std::vector<vertex_t> v_coalesced{5, 3, 4, 9, 0, 1, 6, 2, 7, 3, 2, 5};
+  // Only the number of weights is used below (as the expected edge count).
+  std::vector<weight_t> w_coalesced{0.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1};
+
+  auto num_paths = v_sizes.size();
+  auto total_sz  = v_coalesced.size();
+  auto num_edges = w_coalesced.size();
+
+  // A path with k vertices has k - 1 edges.
+  ASSERT_EQ(num_edges, total_sz - num_paths);
+
+  vector_test_t<vertex_t> d_coalesced_v(total_sz, handle.get_stream());
+  vector_test_t<index_t> d_sizes(num_paths, handle.get_stream());
+
+  raft::update_device(
+    d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream());
+  raft::update_device(d_sizes.data(), v_sizes.data(), d_sizes.size(), handle.get_stream());
+
+  // Taken before d_coalesced_v.release() is called below.
+  index_t coalesced_v_sz = d_coalesced_v.size();
+
+  auto tpl_coo_offsets = convert_paths_to_coo<vertex_t>(handle,
+                                                        coalesced_v_sz,
+                                                        static_cast<index_t>(num_paths),
+                                                        d_coalesced_v.release(),
+                                                        d_sizes.release());
+
+  auto& d_src     = std::get<0>(tpl_coo_offsets);
+  auto& d_dst     = std::get<1>(tpl_coo_offsets);
+  auto& d_offsets = std::get<2>(tpl_coo_offsets);
+
+  ASSERT_EQ(d_src.size(), num_edges);
+  ASSERT_EQ(d_dst.size(), num_edges);
+
+  std::vector<vertex_t> v_src(num_edges, 0);
+  std::vector<vertex_t> v_dst(num_edges, 0);
+  std::vector<index_t> v_offsets(d_offsets.size(), 0);
+
+  raft::update_host(v_src.data(), raw_const_ptr(d_src), d_src.size(), handle.get_stream());
+  raft::update_host(v_dst.data(), raw_const_ptr(d_dst), d_dst.size(), handle.get_stream());
+  raft::update_host(
+    v_offsets.data(), raw_const_ptr(d_offsets), d_offsets.size(), handle.get_stream());
+
+  // The copies above are issued on the handle's stream; wait for them before
+  // reading the host vectors.
+  ASSERT_EQ(cudaStreamSynchronize(handle.get_stream()), cudaSuccess);
+
+  // Expected edges per path: (5,3) | (9,0),(0,1) | (6,2),(2,7),(7,3),(3,2);
+  // the expected offsets are the start index of each of those three groups.
+  std::vector<vertex_t> v_src_exp{5, 9, 0, 6, 2, 7, 3};
+  std::vector<vertex_t> v_dst_exp{3, 0, 1, 2, 7, 3, 2};
+  std::vector<index_t> v_offsets_exp{0, 1, 3};
+
+  EXPECT_EQ(v_src, v_src_exp);
+  EXPECT_EQ(v_dst, v_dst_exp);
+  EXPECT_EQ(v_offsets, v_offsets_exp);
+}
diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp
index 79a86e1fc95..770fbc99397 100644
--- a/cpp/tests/utilities/base_fixture.hpp
+++ b/cpp/tests/utilities/base_fixture.hpp
@@ -95,7 +95,7 @@ inline std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(
   if (allocation_mode == "binning") return make_binning();
   if (allocation_mode == "cuda") return make_cuda();
   if (allocation_mode == "pool") return make_pool();
-  if (allocation_mode == "managed") make_managed();
+  if (allocation_mode == "managed") return make_managed();
   CUGRAPH_FAIL("Invalid RMM allocation mode");
 }