From f2e5a8755e18aadfc151ca65787dd4a3775efb85 Mon Sep 17 00:00:00 2001
From: Andrei Schaffer <37386037+aschaffer@users.noreply.github.com>
Date: Tue, 30 Mar 2021 13:09:33 -0500
Subject: [PATCH] Implement C/CUDA RandomWalks functionality (#1439)

This PR tracks work on issue: https://github.com/rapidsai/cugraph/issues/1380.

Authors:
  - Andrei Schaffer (@aschaffer)

Approvers:
  - Brad Rees (@BradReesWork)
  - Chuck Hastings (@ChuckHastings)
  - Seunghwa Kang (@seunghwak)

URL: https://github.com/rapidsai/cugraph/pull/1439
---
 cpp/CMakeLists.txt                            |   1 +
 cpp/include/algorithms.hpp                    |  28 +
 cpp/include/utilities/cython.hpp              |  24 +
 cpp/include/utilities/graph_traits.hpp        |  61 ++
 cpp/src/experimental/random_walks.cuh         | 887 ++++++++++++++++++
 cpp/src/sampling/random_walks.cu              |  78 ++
 cpp/src/utilities/cython.cu                   |  76 ++
 cpp/tests/CMakeLists.txt                      |  14 +
 cpp/tests/experimental/random_walks_test.cu   | 152 +++
 cpp/tests/experimental/random_walks_utils.cuh | 152 +++
 cpp/tests/experimental/rw_low_level_test.cu   | 783 ++++++++++++++++
 11 files changed, 2256 insertions(+)
 create mode 100644 cpp/include/utilities/graph_traits.hpp
 create mode 100644 cpp/src/experimental/random_walks.cuh
 create mode 100644 cpp/src/sampling/random_walks.cu
 create mode 100644 cpp/tests/experimental/random_walks_test.cu
 create mode 100644 cpp/tests/experimental/random_walks_utils.cuh
 create mode 100644 cpp/tests/experimental/rw_low_level_test.cu
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 34ea935e31d..57f324a60a9 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -420,6 +420,7 @@ add_library(cugraph SHARED
     src/community/triangles_counting.cu
     src/community/extract_subgraph_by_vertex.cu
     src/community/egonet.cu
+    src/sampling/random_walks.cu
     src/cores/core_number.cu
     src/traversal/two_hop_neighbors.cu
     src/components/connectivity.cu
diff --git a/cpp/include/algorithms.hpp b/cpp/include/algorithms.hpp
index b8706d81e21..0b45b799357 100644
--- a/cpp/include/algorithms.hpp
+++ b/cpp/include/algorithms.hpp
@@ -1252,5 +1252,33 @@ extract_ego(raft::handle_t const &handle,
             vertex_t *source_vertex,
             vertex_t n_subgraphs,
             vertex_t radius);
+
+/**
+ * @brief returns random walks (RW) from starting sources, where each path is of given maximum
+ * length. Uniform distribution is assumed for the random engine.
+ *
+ * @tparam graph_t Type of graph/view (typically, graph_view_t).
+ * @tparam index_t Type used to store indexing and sizes.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph Graph (view )object to generate RW on.
+ * @param ptr_d_start Device pointer to set of starting vertex indices for the RW.
+ * @param num_paths = number(paths).
+ * @param max_depth maximum length of RWs.
+ * @return std::tuple<device_vec_t<vertex_t>, device_vec_t<weight_t>,
+ * device_vec_t<index_t>> Triplet of coalesced RW paths, with corresponding edge weights for
+ * each, and corresponding path sizes. This is meant to minimize the number of DF's to be passed to
+ * the Python layer. The meaning of "coalesced" here is that a 2D array of paths of different sizes
+ * is represented as a 1D array.
+ */
+template <typename graph_t, typename index_t>
+std::tuple<rmm::device_uvector<typename graph_t::vertex_type>,
+           rmm::device_uvector<typename graph_t::weight_type>,
+           rmm::device_uvector<index_t>>
+random_walks(raft::handle_t const &handle,
+             graph_t const &graph,
+             typename graph_t::vertex_type const *ptr_d_start,
+             index_t num_paths,
+             index_t max_depth);
 }  // namespace experimental
 }  // namespace cugraph
diff --git a/cpp/include/utilities/cython.hpp b/cpp/include/utilities/cython.hpp
index 98e850abbf0..a58331d465a 100644
--- a/cpp/include/utilities/cython.hpp
+++ b/cpp/include/utilities/cython.hpp
@@ -19,6 +19,7 @@
 #include <graph.hpp>
 #include <raft/handle.hpp>
 #include <rmm/device_uvector.hpp>
+#include <utilities/graph_traits.hpp>
 
 namespace cugraph {
 namespace cython {
@@ -185,6 +186,19 @@ struct major_minor_weights_t {
   rmm::device_uvector<weight_t> shuffled_weights_;
 };
 
+// aggregate for random_walks() return type
+// to be exposed to cython:
+//
+struct random_walk_ret_t {
+  size_t coalesced_sz_v_;
+  size_t coalesced_sz_w_;
+  size_t num_paths_;
+  size_t max_depth_;
+  std::unique_ptr<rmm::device_buffer> d_coalesced_v_;
+  std::unique_ptr<rmm::device_buffer> d_coalesced_w_;
+  std::unique_ptr<rmm::device_buffer> d_sizes_;
+};
+
 // wrapper for renumber_edgelist() return
 // (unrenumbering maps, etc.)
 //
@@ -442,6 +456,16 @@ std::unique_ptr<cy_multi_edgelists_t> call_egonet(raft::handle_t const& handle,
                                                   vertex_t* source_vertex,
                                                   vertex_t n_subgraphs,
                                                   vertex_t radius);
+// wrapper for random_walks.
+//
+template <typename vertex_t, typename edge_t>
+std::enable_if_t<cugraph::experimental::is_vertex_edge_combo<vertex_t, edge_t>::value,
+                 std::unique_ptr<random_walk_ret_t>>
+call_random_walks(raft::handle_t const& handle,
+                  graph_container_t const& graph_container,
+                  vertex_t const* ptr_start_set,
+                  edge_t num_paths,
+                  edge_t max_depth);
 
 // wrapper for shuffling:
 //
diff --git a/cpp/include/utilities/graph_traits.hpp b/cpp/include/utilities/graph_traits.hpp
new file mode 100644
index 00000000000..363a13190be
--- /dev/null
+++ b/cpp/include/utilities/graph_traits.hpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <type_traits>
+
+namespace cugraph {
+namespace experimental {
+
+// primary template:
+//
+template <typename Src, typename... Types>
+struct is_one_of;  // purposely empty
+
+// partial specializations:
+//
+template <typename Src, typename Head, typename... Tail>
+struct is_one_of<Src, Head, Tail...> {
+  static constexpr bool value = std::is_same<Src, Head>::value || is_one_of<Src, Tail...>::value;
+};
+
+template <typename Src>
+struct is_one_of<Src> {
+  static constexpr bool value = false;
+};
+
+// meta-function that constrains
+// vertex_t and edge_t template param candidates:
+//
+template <typename vertex_t, typename edge_t>
+struct is_vertex_edge_combo {
+  static constexpr bool value = is_one_of<vertex_t, int32_t, int64_t>::value &&
+                                is_one_of<edge_t, int32_t, int64_t>::value &&
+                                (sizeof(vertex_t) <= sizeof(edge_t));
+};
+
+// meta-function that constrains
+// all 3 template param candidates:
+//
+template <typename vertex_t, typename edge_t, typename weight_t>
+struct is_candidate {
+  static constexpr bool value =
+    is_vertex_edge_combo<vertex_t, edge_t>::value && is_one_of<weight_t, float, double>::value;
+};
+
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/experimental/random_walks.cuh b/cpp/src/experimental/random_walks.cuh
new file mode 100644
index 00000000000..aea8f3d8420
--- /dev/null
+++ b/cpp/src/experimental/random_walks.cuh
@@ -0,0 +1,887 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Andrei Schaffer, aschaffer@nvidia.com
+//
+#pragma once
+
+#include <experimental/graph.hpp>
+
+#include <utilities/graph_utils.cuh>
+
+#include <raft/device_atomics.cuh>
+#include <raft/handle.hpp>
+#include <raft/random/rng.cuh>
+
+#include <rmm/thrust_rmm_allocator.h>
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/copy.h>
+#include <thrust/count.h>
+#include <thrust/find.h>
+#include <thrust/gather.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
+#include <thrust/logical.h>
+#include <thrust/remove.h>
+#include <thrust/transform.h>
+#include <thrust/tuple.h>
+
+#include <cassert>
+#include <ctime>
+#include <tuple>
+#include <type_traits>
+
+namespace cugraph {
+namespace experimental {
+
+namespace detail {
+
+template <typename T>
+using device_vec_t = rmm::device_uvector<T>;
+
+template <typename T>
+using device_v_it = typename device_vec_t<T>::iterator;
+
+template <typename value_t>
+value_t* raw_ptr(device_vec_t<value_t>& dv)
+{
+  return dv.data();
+}
+
+template <typename value_t>
+value_t const* raw_const_ptr(device_vec_t<value_t> const& dv)
+{
+  return dv.data();
+}
+
+template <typename value_t, typename index_t = size_t>
+struct device_const_vector_view {
+  device_const_vector_view(value_t const* d_buffer, index_t size) : d_buffer_(d_buffer), size_(size)
+  {
+  }
+
+  device_const_vector_view(device_const_vector_view const& other) = delete;
+  device_const_vector_view& operator=(device_const_vector_view const& other) = delete;
+
+  device_const_vector_view(device_const_vector_view&& other)
+  {
+    d_buffer_ = other.d_buffer_;
+    size_     = other.size_;
+  }
+  device_const_vector_view& operator=(device_const_vector_view&& other)
+  {
+    d_buffer_ = other.d_buffer_;
+    size_     = other.size_;
+
+    return *this;
+  }
+
+  value_t const* begin(void) const { return d_buffer_; }
+
+  value_t const* end() const { return d_buffer_ + size_; }
+
+  index_t size(void) const { return size_; }
+
+ private:
+  value_t const* d_buffer_{nullptr};
+  index_t size_;
+};
+
+// raft random generator:
+// (using upper-bound cached "map"
+//  giving out_deg(v) for each v in [0, |V|);
+//  and a pre-generated vector of float random values
+//  in [0,1] to be brought into [0, d_ub[v]))
+//
+template <typename vertex_t,
+          typename edge_t,
+          typename seed_t  = uint64_t,
+          typename real_t  = float,
+          typename index_t = edge_t>
+struct rrandom_gen_t {
+  using seed_type = seed_t;
+  using real_type = real_t;
+
+  rrandom_gen_t(raft::handle_t const& handle,
+                index_t num_paths,
+                device_vec_t<real_t>& d_random,             // scratch-pad, non-coalesced
+                device_vec_t<edge_t> const& d_crt_out_deg,  // non-coalesced
+                seed_t seed = seed_t{})
+    : handle_(handle),
+      seed_(seed),
+      num_paths_(num_paths),
+      d_ptr_out_degs_(raw_const_ptr(d_crt_out_deg)),
+      d_ptr_random_(raw_ptr(d_random))
+  {
+    auto rnd_sz = d_random.size();
+
+    CUGRAPH_EXPECTS(rnd_sz >= static_cast<decltype(rnd_sz)>(num_paths),
+                    "Un-allocated random buffer.");
+
+    // done in constructor;
+    // this must be done at each step,
+    // but this object is constructed at each step;
+    //
+    raft::random::Rng rng(seed_);
+    rng.uniform<real_t, index_t>(
+      d_ptr_random_, num_paths, real_t{0.0}, real_t{1.0}, handle.get_stream());
+  }
+
+  // in place:
+  // for each v in [0, num_paths) {
+  // if out_deg(v) > 0
+  //   d_col_indx[v] = random index in [0, out_deg(v))
+  //}
+  void generate_col_indices(device_vec_t<vertex_t>& d_col_indx) const
+  {
+    thrust::transform_if(
+      rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+      d_ptr_random_,
+      d_ptr_random_ + num_paths_,  // input1
+      d_ptr_out_degs_,             // input2
+      d_ptr_out_degs_,             // also stencil
+      d_col_indx.begin(),
+      [] __device__(real_t rnd_vindx, edge_t crt_out_deg) {
+        real_t max_ub     = static_cast<real_t>(crt_out_deg - 1);
+        auto interp_vindx = rnd_vindx * max_ub + real_t{.5};
+        vertex_t v_indx   = static_cast<vertex_t>(interp_vindx);
+        return (v_indx >= crt_out_deg ? crt_out_deg - 1 : v_indx);
+      },
+      [] __device__(auto crt_out_deg) { return crt_out_deg > 0; });
+  }
+
+ private:
+  raft::handle_t const& handle_;
+  index_t num_paths_;
+  edge_t const* d_ptr_out_degs_;  // device buffer with out-deg of current set of vertices (most
+                                  // recent vertex in each path); size = num_paths_
+  real_t* d_ptr_random_;          // device buffer with real random values; size = num_paths_
+  seed_t seed_;                   // seed to be used for current batch
+};
+
+// seeding policy: time (clock) dependent,
+// to avoid RW calls repeating same random data:
+//
+template <typename seed_t>
+struct clock_seeding_t {
+  clock_seeding_t(void) = default;
+
+  seed_t operator()(void) { return static_cast<seed_t>(std::time(nullptr)); }
+};
+
+// seeding policy: fixed for debug/testing repro
+//
+template <typename seed_t>
+struct fixed_seeding_t {
+  // purposely no default cnstr.
+
+  fixed_seeding_t(seed_t seed) : seed_(seed) {}
+  seed_t operator()(void) { return seed_; }
+
+ private:
+  seed_t seed_;
+};
+
+// classes abstracting the next vertex extraction mechanism:
+//
+// primary template, purposely undefined
+template <typename graph_t,
+          typename index_t  = typename graph_t::edge_type,
+          typename enable_t = void>
+struct col_indx_extract_t;
+
+// specialization for single-gpu functionality:
+//
+template <typename graph_t, typename index_t>
+struct col_indx_extract_t<graph_t, index_t, std::enable_if_t<graph_t::is_multi_gpu == false>> {
+  using vertex_t = typename graph_t::vertex_type;
+  using edge_t   = typename graph_t::edge_type;
+  using weight_t = typename graph_t::weight_type;
+
+  col_indx_extract_t(raft::handle_t const& handle,
+                     device_vec_t<vertex_t> const& d_indices,
+                     device_vec_t<edge_t> const& d_offsets,
+                     device_vec_t<weight_t> const& d_values,
+                     device_vec_t<edge_t> const& d_crt_out_degs,
+                     device_vec_t<index_t> const& d_sizes,
+                     index_t num_paths,
+                     index_t max_depth)
+    : handle_(handle),
+      col_indices_(raw_const_ptr(d_indices)),
+      row_offsets_(raw_const_ptr(d_offsets)),
+      values_(raw_const_ptr(d_values)),
+      out_degs_(raw_const_ptr(d_crt_out_degs)),
+      sizes_(raw_const_ptr(d_sizes)),
+      num_paths_(num_paths),
+      max_depth_(max_depth)
+  {
+  }
+
+  col_indx_extract_t(raft::handle_t const& handle,
+                     vertex_t const* p_d_indices,
+                     edge_t const* p_d_offsets,
+                     weight_t const* p_d_values,
+                     edge_t const* p_d_crt_out_degs,
+                     index_t const* p_d_sizes,
+                     index_t num_paths,
+                     index_t max_depth)
+    : handle_(handle),
+      col_indices_(p_d_indices),
+      row_offsets_(p_d_offsets),
+      values_(p_d_values),
+      out_degs_(p_d_crt_out_degs),
+      sizes_(p_d_sizes),
+      num_paths_(num_paths),
+      max_depth_(max_depth)
+  {
+  }
+
+  col_indx_extract_t(raft::handle_t const& handle,
+                     graph_t const& graph,
+                     edge_t const* p_d_crt_out_degs,
+                     index_t const* p_d_sizes,
+                     index_t num_paths,
+                     index_t max_depth)
+    : handle_(handle),
+      col_indices_(graph.indices()),
+      row_offsets_(graph.offsets()),
+      values_(graph.weights()),
+      out_degs_(p_d_crt_out_degs),
+      sizes_(p_d_sizes),
+      num_paths_(num_paths),
+      max_depth_(max_depth)
+  {
+  }
+
+  // in-place extractor of next set of vertices and weights,
+  // (d_v_next_vertices, d_v_next_weights),
+  // given start set of vertices. d_v_src_vertices,
+  // and corresponding column index set, d_v_col_indx:
+  //
+  // for each indx in [0, num_paths){
+  //   v_indx = d_v_src_vertices[indx*max_depth + d_sizes[indx] - 1];
+  //   if( out_degs_[v_indx] > 0 ) {
+  //      start_row = row_offsets_[v_indx];
+  //      delta = d_v_col_indx[indx];
+  //      d_v_next_vertices[indx] = col_indices_[start_row + delta];
+  // }
+  // (use tranform_if() with transform iterator)
+  //
+  void operator()(
+    device_vec_t<vertex_t> const& d_coalesced_src_v,  // in: coalesced vector of vertices
+    device_vec_t<vertex_t> const&
+      d_v_col_indx,  // in: column indices, given by stepper's random engine
+    device_vec_t<vertex_t>& d_v_next_vertices,  // out: set of destination vertices, for next step
+    device_vec_t<weight_t>&
+      d_v_next_weights)  // out: set of weights between src and destination vertices, for next step
+    const
+  {
+    thrust::transform_if(
+      rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+      thrust::make_counting_iterator<index_t>(0),
+      thrust::make_counting_iterator<index_t>(num_paths_),  // input1
+      d_v_col_indx.begin(),                                 // input2
+      out_degs_,                                            // stencil
+      thrust::make_zip_iterator(
+        thrust::make_tuple(d_v_next_vertices.begin(), d_v_next_weights.begin())),  // output
+      [max_depth         = max_depth_,
+       ptr_d_sizes       = sizes_,
+       ptr_d_coalesced_v = raw_const_ptr(d_coalesced_src_v),
+       row_offsets       = row_offsets_,
+       col_indices       = col_indices_,
+       values            = values_] __device__(auto indx, auto col_indx) {
+        auto delta     = ptr_d_sizes[indx] - 1;
+        auto v_indx    = ptr_d_coalesced_v[indx * max_depth + delta];
+        auto start_row = row_offsets[v_indx];
+        return thrust::make_tuple(col_indices[start_row + col_indx], values[start_row + col_indx]);
+      },
+      [] __device__(auto crt_out_deg) { return crt_out_deg > 0; });
+  }
+
+ private:
+  raft::handle_t const& handle_;
+  vertex_t const* col_indices_;
+  edge_t const* row_offsets_;
+  weight_t const* values_;
+
+  edge_t const* out_degs_;
+  index_t const* sizes_;
+  index_t num_paths_;
+  index_t max_depth_;
+};
+
+/**
+ * @brief Class abstracting the RW initialization, stepping, and stopping functionality
+ *        The outline of the algorithm is as follows:
+ *
+ *        (1) vertex sets are coalesced into d_coalesced_v,
+ *            weight sets are coalesced into d_coalesced_w;
+ *            i.e., the 2 coalesced vectors are allocated to
+ *            num_paths * max_depth, and num_paths * (max_depth -1), respectively
+ *            (since each path has a number of edges equal one
+ *             less than the number of vertices);
+ *            d_coalesced_v is initialized for each i*max_depth entry
+ *            (i=0,,,,num_paths-1) to the corresponding starting vertices;
+ *        (2) d_sizes maintains the current size is for each path;
+ *            Note that a path may end prematurely if it reaches a sink vertex;
+ *        (3) d_crt_out_degs maintains the out-degree of each of the latest
+ *            vertices in the path; i.e., if N(v) := set of destination
+ *            vertices from v, then this vector stores |N(v)|
+ *            for last v in each path; i.e.,
+ *            d_crt_out_degs[i] =
+ *              out-degree( d_coalesced_v[i*max_depth + d_sizes[i]-1] ),
+ *            for i in {0,..., num_paths-1};
+ *        (4) a set of num_paths floating point numbers between [0,1]
+ *            are generated at each step; then they get translated into
+ *            _indices_ k in {0,...d_crt_out_degs[i]-1};
+ *        (5) the next vertex v is then picked as the k-th out-neighbor:
+ *            next(v) = N(v)[k];
+ *        (6) d_sizes are incremented accordingly; i.e., for those paths whose
+ *            corresponding last vertex has out-degree > 0;
+ *        (7) then next(v) and corresponding weight of (v, next(v)) are stored
+ *            at appropriate location in their corresponding coalesced vectors;
+ *        (8) the client of this class (the random_walks() function) then repeats
+ *            this process max_depth times or until all paths
+ *            have reached sinks; i.e., d_crt_out_degs = {0, 0,...,0},
+ *            whichever comes first;
+ *        (9) in the end some post-processing is done (stop()) to remove
+ *            unused entries from the 2 coalesced vectors;
+ *        (10) the triplet made of the 2 coalesced vectors and d_sizes is then returned;
+ *
+ */
+template <typename graph_t,
+          typename random_engine_t =
+            rrandom_gen_t<typename graph_t::vertex_type, typename graph_t::edge_type>,
+          typename index_t = typename graph_t::edge_type>
+struct random_walker_t {
+  using vertex_t = typename graph_t::vertex_type;
+  using edge_t   = typename graph_t::edge_type;
+  using weight_t = typename graph_t::weight_type;
+  using seed_t   = typename random_engine_t::seed_type;
+  using real_t   = typename random_engine_t::real_type;
+
+  random_walker_t(raft::handle_t const& handle,
+                  graph_t const& graph,
+                  index_t num_paths,
+                  index_t max_depth)
+    : handle_(handle),
+      num_paths_(num_paths),
+      max_depth_(max_depth),
+      d_cached_out_degs_(graph.compute_out_degrees(handle_))
+  {
+  }
+
+  // for each i in [0..num_paths_) {
+  //   d_paths_v_set[i*max_depth] = d_src_init_v[i];
+  //
+  void start(device_const_vector_view<vertex_t, index_t>& d_src_init_v,  // in: start set
+             device_vec_t<vertex_t>& d_paths_v_set,                      // out: coalesced v
+             device_vec_t<index_t>& d_sizes) const  // out: init sizes to {1,...}
+  {
+    // intialize path sizes to 1, as they contain at least one vertex each:
+    // the initial set: d_src_init_v;
+    //
+    thrust::copy_n(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+                   thrust::make_constant_iterator<index_t>(1),
+                   num_paths_,
+                   d_sizes.begin());
+
+    // scatter d_src_init_v to coalesced vertex vector:
+    //
+    auto dlambda = [stride = max_depth_] __device__(auto indx) { return indx * stride; };
+
+    // use the transform iterator as map:
+    //
+    auto map_it_begin =
+      thrust::make_transform_iterator(thrust::make_counting_iterator<index_t>(0), dlambda);
+
+    thrust::scatter(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+                    d_src_init_v.begin(),
+                    d_src_init_v.end(),
+                    map_it_begin,
+                    d_paths_v_set.begin());
+  }
+
+  // overload for start() with device_uvector d_v_start
+  // (handy for testing)
+  //
+  void start(device_vec_t<vertex_t> const& d_start,  // in: start set
+             device_vec_t<vertex_t>& d_paths_v_set,  // out: coalesced v
+             device_vec_t<index_t>& d_sizes) const   // out: init sizes to {1,...}
+  {
+    device_const_vector_view<vertex_t, index_t> d_start_cview{d_start.data(),
+                                                              static_cast<index_t>(d_start.size())};
+
+    start(d_start_cview, d_paths_v_set, d_sizes);
+  }
+
+  // in-place updates its arguments from one step to next
+  // (to avoid copying); all "crt" arguments are updated at each step()
+  // and passed as scratchpad space to avoid copying them
+  // from one step to another
+  //
+  // take one step in sync for all paths that have not reached sinks:
+  //
+  void step(
+    graph_t const& graph,
+    seed_t seed,
+    device_vec_t<vertex_t>& d_coalesced_v,  // crt coalesced vertex set
+    device_vec_t<weight_t>& d_coalesced_w,  // crt coalesced weight set
+    device_vec_t<index_t>& d_paths_sz,      // crt paths sizes
+    device_vec_t<edge_t>& d_crt_out_degs,   // crt out-degs for current set of vertices
+    device_vec_t<real_t>& d_random,         // crt set of random real values
+    device_vec_t<vertex_t>& d_col_indx,  // crt col col indices to be used for retrieving next step
+    device_vec_t<vertex_t>& d_next_v,    // crt set of destination vertices, for next step
+    device_vec_t<weight_t>& d_next_w)
+    const  // set of weights between src and destination vertices, for next step
+  {
+    // update crt snapshot of out-degs,
+    // from cached out degs, using
+    // latest vertex in each path as source:
+    //
+    gather_from_coalesced(
+      d_coalesced_v, d_cached_out_degs_, d_paths_sz, d_crt_out_degs, max_depth_, num_paths_);
+
+    // generate random destination indices:
+    //
+    random_engine_t rgen(handle_, num_paths_, d_random, d_crt_out_degs, seed);
+
+    rgen.generate_col_indices(d_col_indx);
+
+    // dst extraction from dst indices:
+    //
+    col_indx_extract_t<graph_t> col_extractor(handle_,
+                                              graph,
+                                              raw_const_ptr(d_crt_out_degs),
+                                              raw_const_ptr(d_paths_sz),
+                                              num_paths_,
+                                              max_depth_);
+
+    // The following steps update the next entry in each path,
+    // except the paths that reached sinks;
+    //
+    // for each indx in [0..num_paths) {
+    //   v_indx = d_v_rnd_n_indx[indx];
+    //
+    //   -- get the `v_indx`-th out-vertex of d_v_paths_v_set[indx] vertex:
+    //   -- also, note the size deltas increased by 1 in dst (d_sizes[]):
+    //
+    //   d_coalesced_v[indx*num_paths + d_sizes[indx]] =
+    //       get_out_vertex(graph, d_coalesced_v[indx*num_paths + d_sizes[indx] -1)], v_indx);
+    //   d_coalesced_w[indx*(num_paths-1) + d_sizes[indx] - 1] =
+    //       get_out_edge_weight(graph, d_coalesced_v[indx*num_paths + d_sizes[indx]-2], v_indx);
+    //
+    // (1) generate actual vertex destinations:
+    //
+    col_extractor(d_coalesced_v, d_col_indx, d_next_v, d_next_w);
+
+    // (2) update path sizes:
+    //
+    update_path_sizes(d_crt_out_degs, d_paths_sz);
+
+    // (3) actual coalesced updates:
+    //
+    scatter_vertices(d_next_v, d_coalesced_v, d_crt_out_degs, d_paths_sz);
+    scatter_weights(d_next_w, d_coalesced_w, d_crt_out_degs, d_paths_sz);
+  }
+
+  // returns true if all paths reached sinks:
+  //
+  bool all_paths_stopped(device_vec_t<edge_t> const& d_crt_out_degs) const
+  {
+    auto how_many_stopped =
+      thrust::count_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+                       d_crt_out_degs.begin(),
+                       d_crt_out_degs.end(),
+                       [] __device__(auto crt_out_deg) { return crt_out_deg == 0; });
+    return (static_cast<size_t>(how_many_stopped) == d_crt_out_degs.size());
+  }
+
+  // wrap-up, post-process:
+  // truncate v_set, w_set to actual space used
+  //
+  void stop(device_vec_t<vertex_t>& d_coalesced_v,       // coalesced vertex set
+            device_vec_t<weight_t>& d_coalesced_w,       // coalesced weight set
+            device_vec_t<index_t> const& d_sizes) const  // paths sizes
+  {
+    assert(max_depth_ > 1);  // else, no need to step; and no edges
+
+    index_t const* ptr_d_sizes = d_sizes.data();
+
+    auto predicate_v = [max_depth = max_depth_, ptr_d_sizes] __device__(auto indx) {
+      auto row_indx = indx / max_depth;
+      auto col_indx = indx % max_depth;
+
+      return (col_indx >= ptr_d_sizes[row_indx]);
+    };
+
+    auto predicate_w = [max_depth = max_depth_, ptr_d_sizes] __device__(auto indx) {
+      auto row_indx = indx / (max_depth - 1);
+      auto col_indx = indx % (max_depth - 1);
+
+      return (col_indx >= ptr_d_sizes[row_indx] - 1);
+    };
+
+    auto new_end_v =
+      thrust::remove_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+                        d_coalesced_v.begin(),
+                        d_coalesced_v.end(),
+                        thrust::make_counting_iterator<index_t>(0),
+                        predicate_v);
+
+    auto new_end_w =
+      thrust::remove_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+                        d_coalesced_w.begin(),
+                        d_coalesced_w.end(),
+                        thrust::make_counting_iterator<index_t>(0),
+                        predicate_w);
+
+    CUDA_TRY(cudaStreamSynchronize(handle_.get_stream()));
+
+    d_coalesced_v.resize(thrust::distance(d_coalesced_v.begin(), new_end_v), handle_.get_stream());
+    d_coalesced_w.resize(thrust::distance(d_coalesced_w.begin(), new_end_w), handle_.get_stream());
+  }
+
+  // in-place non-static (needs handle_):
+  // for indx in [0, nelems):
+  //   gather d_result[indx] = d_src[d_coalesced[indx*stride + d_sizes[indx] -1]]
+  //
+  template <typename src_vec_t = vertex_t>
+  void gather_from_coalesced(
+    device_vec_t<vertex_t> const& d_coalesced,  // |gather map| = stride*nelems
+    device_vec_t<src_vec_t> const& d_src,       // |gather input| = nelems
+    device_vec_t<index_t> const& d_sizes,       // |paths sizes| = nelems, elems in [1, stride]
+    device_vec_t<src_vec_t>& d_result,          // |output| = nelems
+    index_t stride,        // stride = coalesce block size (typically max_depth)
+    index_t nelems) const  // nelems = number of elements to gather (typically num_paths_)
+  {
+    vertex_t const* ptr_d_coalesced = raw_const_ptr(d_coalesced);
+    index_t const* ptr_d_sizes      = raw_const_ptr(d_sizes);
+
+    // delta = ptr_d_sizes[indx] - 1
+    //
+    auto dlambda = [stride, ptr_d_sizes, ptr_d_coalesced] __device__(auto indx) {
+      auto delta = ptr_d_sizes[indx] - 1;
+      return ptr_d_coalesced[indx * stride + delta];
+    };
+
+    // use the transform iterator as map:
+    //
+    auto map_it_begin =
+      thrust::make_transform_iterator(thrust::make_counting_iterator<index_t>(0), dlambda);
+
+    thrust::gather(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+                   map_it_begin,
+                   map_it_begin + nelems,
+                   d_src.begin(),
+                   d_result.begin());
+  }
+
+  // in-place non-static (needs handle_);
+  // pre-condition: path sizes are assumed updated
+  // to reflect new vertex additions;
+  //
+  // for indx in [0, nelems):
+  //   if ( d_crt_out_degs[indx] > 0 )
+  //     d_coalesced[indx*stride + (d_sizes[indx] - adjust)- 1] = d_src[indx]
+  //
+  // adjust := 0 for coalesced vertices; 1 for weights
+  // (because |edges| = |vertices| - 1, in each path);
+  //
+  template <typename src_vec_t>
+  void scatter_to_coalesced(
+    device_vec_t<src_vec_t> const& d_src,        // |scatter input| = nelems
+    device_vec_t<src_vec_t>& d_coalesced,        // |scatter input| = stride*nelems
+    device_vec_t<edge_t> const& d_crt_out_degs,  // |current set of vertex out degrees| = nelems,
+                                                 // to be used as stencil (don't scatter if 0)
+    device_vec_t<index_t> const&
+      d_sizes,  // paths sizes used to provide delta in coalesced paths;
+                // pre-condition: assumed as updated to reflect new vertex additions;
+                // also, this is the number of _vertices_ in each path;
+    // hence for scattering weights this needs to be adjusted; hence the `adjust` parameter
+    index_t
+      stride,  // stride = coalesce block size (max_depth for vertices; max_depth-1 for weights)
+    index_t nelems,  // nelems = number of elements to gather (typically num_paths_)
+    index_t adjust = 0)
+    const  // adjusting parameter for scattering vertices (0) or weights (1); see above for more;
+  {
+    index_t const* ptr_d_sizes = raw_const_ptr(d_sizes);
+
+    auto dlambda = [stride, adjust, ptr_d_sizes] __device__(auto indx) {
+      auto delta = ptr_d_sizes[indx] - adjust - 1;
+      return indx * stride + delta;
+    };
+
+    // use the transform iterator as map:
+    //
+    auto map_it_begin =
+      thrust::make_transform_iterator(thrust::make_counting_iterator<index_t>(0), dlambda);
+
+    thrust::scatter_if(rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+                       d_src.begin(),
+                       d_src.end(),
+                       map_it_begin,
+                       d_crt_out_degs.begin(),
+                       d_coalesced.begin(),
+                       [] __device__(auto crt_out_deg) {
+                         return crt_out_deg > 0;  // predicate
+                       });
+  }
+
+  // updates the entries in the corresponding coalesced vector,
+  // for which out_deg > 0
+  //
+  void scatter_vertices(device_vec_t<vertex_t> const& d_src,
+                        device_vec_t<vertex_t>& d_coalesced,
+                        device_vec_t<edge_t> const& d_crt_out_degs,
+                        device_vec_t<index_t> const& d_sizes) const
+  {
+    scatter_to_coalesced(d_src, d_coalesced, d_crt_out_degs, d_sizes, max_depth_, num_paths_);
+  }
+  //
+  void scatter_weights(device_vec_t<weight_t> const& d_src,
+                       device_vec_t<weight_t>& d_coalesced,
+                       device_vec_t<edge_t> const& d_crt_out_degs,
+                       device_vec_t<index_t> const& d_sizes) const
+  {
+    scatter_to_coalesced(
+      d_src, d_coalesced, d_crt_out_degs, d_sizes, max_depth_ - 1, num_paths_, 1);
+  }
+
+  // in-place update (increment) path sizes for paths
+  // that have not reached a sink; i.e., for which
+  // d_crt_out_degs[indx]>0:
+  //
+  void update_path_sizes(device_vec_t<edge_t> const& d_crt_out_degs,
+                         device_vec_t<index_t>& d_sizes) const
+  {
+    thrust::transform_if(
+      rmm::exec_policy(handle_.get_stream())->on(handle_.get_stream()),
+      d_sizes.begin(),
+      d_sizes.end(),           // input
+      d_crt_out_degs.begin(),  // stencil
+      d_sizes.begin(),         // output: in-place
+      [] __device__(auto crt_sz) { return crt_sz + 1; },
+      [] __device__(auto crt_out_deg) { return crt_out_deg > 0; });
+  }
+
+  device_vec_t<edge_t> const& get_out_degs(void) const { return d_cached_out_degs_; }
+
+ private:
+  raft::handle_t const& handle_;
+  index_t num_paths_;
+  index_t max_depth_;
+  device_vec_t<edge_t> d_cached_out_degs_;
+};
+
+/**
+ * @brief returns random walks (RW) from starting sources, where each path is of given maximum
+ * length. Single-GPU specialization.
+ *
+ * @tparam graph_t Type of graph (view).
+ * @tparam random_engine_t Type of random engine used to generate RW.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph Graph object to generate RW on.
+ * @param d_v_start Device (view) set of starting vertex indices for the RW.
+ * number(paths) == d_v_start.size().
+ * @param max_depth maximum length of RWs.
+ * @return std::tuple<device_vec_t<vertex_t>, device_vec_t<weight_t>,
+ * device_vec_t<index_t>, seed> Quadruplet of coalesced RW paths, with corresponding edge weights
+ * for each, and corresponding path sizes. This is meant to minimize the number of DF's to be passed
+ * to the Python layer. Also returning seed for testing / debugging repro. The meaning of
+ * "coalesced" here is that a 2D array of paths of different sizes is represented as a 1D array.
+ */
+template <typename graph_t,
+          typename random_engine_t =
+            rrandom_gen_t<typename graph_t::vertex_type, typename graph_t::edge_type>,
+          typename seeding_policy_t = clock_seeding_t<typename random_engine_t::seed_type>,
+          typename index_t          = typename graph_t::edge_type>
+std::enable_if_t<graph_t::is_multi_gpu == false,
+                 std::tuple<device_vec_t<typename graph_t::vertex_type>,
+                            device_vec_t<typename graph_t::weight_type>,
+                            device_vec_t<index_t>,
+                            typename random_engine_t::seed_type>>
+random_walks_impl(raft::handle_t const& handle,
+                  graph_t const& graph,
+                  device_const_vector_view<typename graph_t::vertex_type, index_t>& d_v_start,
+                  index_t max_depth,
+                  seeding_policy_t seeder = clock_seeding_t<typename random_engine_t::seed_type>{})
+{
+  using vertex_t = typename graph_t::vertex_type;
+  using edge_t   = typename graph_t::edge_type;
+  using weight_t = typename graph_t::weight_type;
+  using seed_t   = typename random_engine_t::seed_type;
+  using real_t   = typename random_engine_t::real_type;
+
+  vertex_t num_vertices = graph.get_number_of_vertices();
+
+  auto how_many_valid =
+    thrust::count_if(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                     d_v_start.begin(),
+                     d_v_start.end(),
+                     [num_vertices] __device__(auto crt_vertex) {
+                       return (crt_vertex >= 0) && (crt_vertex < num_vertices);
+                     });
+
+  CUGRAPH_EXPECTS(static_cast<index_t>(how_many_valid) == d_v_start.size(),
+                  "Invalid set of starting vertices.");
+
+  auto num_paths = d_v_start.size();
+  auto stream    = handle.get_stream();
+
+  random_walker_t<graph_t, random_engine_t> rand_walker{
+    handle, graph, static_cast<index_t>(num_paths), static_cast<index_t>(max_depth)};
+
+  // pre-allocate num_paths * max_depth;
+  //
+  auto coalesced_sz = num_paths * max_depth;
+  device_vec_t<vertex_t> d_coalesced_v(coalesced_sz, stream);  // coalesced vertex set
+  device_vec_t<weight_t> d_coalesced_w(coalesced_sz, stream);  // coalesced weight set
+  device_vec_t<index_t> d_paths_sz(num_paths, stream);         // paths sizes
+  device_vec_t<edge_t> d_crt_out_degs(num_paths, stream);  // out-degs for current set of vertices
+  device_vec_t<real_t> d_random(num_paths, stream);
+  device_vec_t<vertex_t> d_col_indx(num_paths, stream);
+  device_vec_t<vertex_t> d_next_v(num_paths, stream);
+  device_vec_t<weight_t> d_next_w(num_paths, stream);
+
+  // abstracted out seed initialization:
+  //
+  seed_t seed0 = static_cast<seed_t>(seeder());
+
+  // very first vertex, for each path:
+  //
+  rand_walker.start(d_v_start, d_coalesced_v, d_paths_sz);
+
+  // start from 1, as 0-th was initialized above:
+  //
+  for (decltype(max_depth) step_indx = 1; step_indx < max_depth; ++step_indx) {
+    // take one-step in-sync for each path in parallel:
+    //
+    rand_walker.step(graph,
+                     seed0 + static_cast<seed_t>(step_indx),
+                     d_coalesced_v,
+                     d_coalesced_w,
+                     d_paths_sz,
+                     d_crt_out_degs,
+                     d_random,
+                     d_col_indx,
+                     d_next_v,
+                     d_next_w);
+
+    // early exit: all paths have reached sinks:
+    //
+    if (rand_walker.all_paths_stopped(d_crt_out_degs)) break;
+  }
+
+  // wrap-up, post-process:
+  // truncate v_set, w_set to actual space used
+  //
+  rand_walker.stop(d_coalesced_v, d_coalesced_w, d_paths_sz);
+
+  // because device_uvector is not copy-cnstr-able:
+  //
+  return std::make_tuple(std::move(d_coalesced_v),
+                         std::move(d_coalesced_w),
+                         std::move(d_paths_sz),
+                         seed0);  // also return seed for repro
+}
+
+/**
+ * @brief returns random walks (RW) from starting sources, where each path is of given maximum
+ * length. Multi-GPU specialization.
+ *
+ * @tparam graph_t Type of graph (view).
+ * @tparam random_engine_t Type of random engine used to generate RW.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph Graph object to generate RW on.
+ * @param d_v_start Device (view) set of starting vertex indices for the RW. number(RW) ==
+ * d_v_start.size().
+ * @param max_depth maximum length of RWs.
+ * @return std::tuple<device_vec_t<vertex_t>, device_vec_t<weight_t>,
+ * device_vec_t<index_t>, seed> Quadruplet of coalesced RW paths, with corresponding edge weights
+ * for each, and coresponding path sizes. This is meant to minimize the number of DF's to be passed
+ * to the Python layer. Also returning seed for testing / debugging repro. The meaning of
+ * "coalesced" here is that a 2D array of paths of different sizes is represented as a 1D array.
+ */
+template <typename graph_t,
+          typename random_engine_t =
+            rrandom_gen_t<typename graph_t::vertex_type, typename graph_t::edge_type>,
+          typename seeding_policy_t = clock_seeding_t<typename random_engine_t::seed_type>,
+          typename index_t          = typename graph_t::edge_type>
+std::enable_if_t<graph_t::is_multi_gpu == true,
+                 std::tuple<device_vec_t<typename graph_t::vertex_type>,
+                            device_vec_t<typename graph_t::weight_type>,
+                            device_vec_t<index_t>,
+                            typename random_engine_t::seed_type>>
+random_walks_impl(raft::handle_t const& handle,
+                  graph_t const& graph,
+                  device_const_vector_view<typename graph_t::vertex_type, index_t>& d_v_start,
+                  index_t max_depth,
+                  seeding_policy_t seeder = clock_seeding_t<typename random_engine_t::seed_type>{})
+{
+  CUGRAPH_FAIL("Not implemented yet.");
+}
+
+}  // namespace detail
+
+/**
+ * @brief returns random walks (RW) from starting sources, where each path is of given maximum
+ * length. Uniform distribution is assumed for the random engine.
+ *
+ * @tparam graph_t Type of graph (view).
+ * @tparam index_t Type used to store indexing and sizes.
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param graph Graph object to generate RW on.
+ * @param ptr_d_start Device pointer to set of starting vertex indices for the RW.
+ * @param num_paths = number(paths).
+ * @param max_depth maximum length of RWs.
+ * @return std::tuple<device_vec_t<vertex_t>, device_vec_t<weight_t>,
+ * device_vec_t<index_t>> Triplet of coalesced RW paths, with corresponding edge weights for
+ * each, and coresponding path sizes. This is meant to minimize the number of DF's to be passed to
+ * the Python layer.
+ */
+template <typename graph_t, typename index_t>
+std::tuple<rmm::device_uvector<typename graph_t::vertex_type>,
+           rmm::device_uvector<typename graph_t::weight_type>,
+           rmm::device_uvector<index_t>>
+random_walks(raft::handle_t const& handle,
+             graph_t const& graph,
+             typename graph_t::vertex_type const* ptr_d_start,
+             index_t num_paths,
+             index_t max_depth)
+{
+  using vertex_t = typename graph_t::vertex_type;
+
+  // 0-copy const device view:
+  //
+  detail::device_const_vector_view<vertex_t, index_t> d_v_start{ptr_d_start, num_paths};
+
+  auto quad_tuple = detail::random_walks_impl(handle, graph, d_v_start, max_depth);
+  // ignore last element of the quad, seed,
+  // since it's meant for testing / debugging, only:
+  //
+  return std::make_tuple(std::move(std::get<0>(quad_tuple)),
+                         std::move(std::get<1>(quad_tuple)),
+                         std::move(std::get<2>(quad_tuple)));
+}
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/sampling/random_walks.cu b/cpp/src/sampling/random_walks.cu
new file mode 100644
index 00000000000..88d5d9ed5c8
--- /dev/null
+++ b/cpp/src/sampling/random_walks.cu
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Andrei Schaffer, aschaffer@nvidia.com
+//
+#include <algorithms.hpp>
+#include <experimental/random_walks.cuh>
+
+namespace cugraph {
+namespace experimental {
+// template explicit instantiation directives (EIDir's):
+//
+// SG FP32{
+template std::
+  tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<float>, rmm::device_uvector<int32_t>>
+  random_walks(raft::handle_t const& handle,
+               graph_view_t<int32_t, int32_t, float, false, false> const& gview,
+               int32_t const* ptr_d_start,
+               int32_t num_paths,
+               int32_t max_depth);
+
+template std::
+  tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<float>, rmm::device_uvector<int64_t>>
+  random_walks(raft::handle_t const& handle,
+               graph_view_t<int32_t, int64_t, float, false, false> const& gview,
+               int32_t const* ptr_d_start,
+               int64_t num_paths,
+               int64_t max_depth);
+
+template std::
+  tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<float>, rmm::device_uvector<int64_t>>
+  random_walks(raft::handle_t const& handle,
+               graph_view_t<int64_t, int64_t, float, false, false> const& gview,
+               int64_t const* ptr_d_start,
+               int64_t num_paths,
+               int64_t max_depth);
+//}
+//
+// SG FP64{
+template std::
+  tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<double>, rmm::device_uvector<int32_t>>
+  random_walks(raft::handle_t const& handle,
+               graph_view_t<int32_t, int32_t, double, false, false> const& gview,
+               int32_t const* ptr_d_start,
+               int32_t num_paths,
+               int32_t max_depth);
+
+template std::
+  tuple<rmm::device_uvector<int32_t>, rmm::device_uvector<double>, rmm::device_uvector<int64_t>>
+  random_walks(raft::handle_t const& handle,
+               graph_view_t<int32_t, int64_t, double, false, false> const& gview,
+               int32_t const* ptr_d_start,
+               int64_t num_paths,
+               int64_t max_depth);
+
+template std::
+  tuple<rmm::device_uvector<int64_t>, rmm::device_uvector<double>, rmm::device_uvector<int64_t>>
+  random_walks(raft::handle_t const& handle,
+               graph_view_t<int64_t, int64_t, double, false, false> const& gview,
+               int64_t const* ptr_d_start,
+               int64_t num_paths,
+               int64_t max_depth);
+//}
+}  // namespace experimental
+}  // namespace cugraph
diff --git a/cpp/src/utilities/cython.cu b/cpp/src/utilities/cython.cu
index 5382b4856f3..a9e3146bbcd 100644
--- a/cpp/src/utilities/cython.cu
+++ b/cpp/src/utilities/cython.cu
@@ -696,6 +696,61 @@ std::unique_ptr<cy_multi_edgelists_t> call_egonet(raft::handle_t const& handle,
   }
 }
 
+// Wrapper for random_walks() through a graph container
+// to expose the API to cython.
+//
+template <typename vertex_t, typename edge_t>
+std::enable_if_t<cugraph::experimental::is_vertex_edge_combo<vertex_t, edge_t>::value,
+                 std::unique_ptr<random_walk_ret_t>>
+call_random_walks(raft::handle_t const& handle,
+                  graph_container_t const& graph_container,
+                  vertex_t const* ptr_start_set,
+                  edge_t num_paths,
+                  edge_t max_depth)
+{
+  if (graph_container.weightType == numberTypeEnum::floatType) {
+    using weight_t = float;
+
+    auto graph =
+      detail::create_graph<vertex_t, edge_t, weight_t, false, false>(handle, graph_container);
+
+    auto triplet = cugraph::experimental::random_walks(
+      handle, graph->view(), ptr_start_set, num_paths, max_depth);
+
+    random_walk_ret_t rw_tri{std::get<0>(triplet).size(),
+                             std::get<1>(triplet).size(),
+                             static_cast<size_t>(num_paths),
+                             static_cast<size_t>(max_depth),
+                             std::make_unique<rmm::device_buffer>(std::get<0>(triplet).release()),
+                             std::make_unique<rmm::device_buffer>(std::get<1>(triplet).release()),
+                             std::make_unique<rmm::device_buffer>(std::get<2>(triplet).release())};
+
+    return std::make_unique<random_walk_ret_t>(std::move(rw_tri));
+
+  } else if (graph_container.weightType == numberTypeEnum::doubleType) {
+    using weight_t = double;
+
+    auto graph =
+      detail::create_graph<vertex_t, edge_t, weight_t, false, false>(handle, graph_container);
+
+    auto triplet = cugraph::experimental::random_walks(
+      handle, graph->view(), ptr_start_set, num_paths, max_depth);
+
+    random_walk_ret_t rw_tri{std::get<0>(triplet).size(),
+                             std::get<1>(triplet).size(),
+                             static_cast<size_t>(num_paths),
+                             static_cast<size_t>(max_depth),
+                             std::make_unique<rmm::device_buffer>(std::get<0>(triplet).release()),
+                             std::make_unique<rmm::device_buffer>(std::get<1>(triplet).release()),
+                             std::make_unique<rmm::device_buffer>(std::get<2>(triplet).release())};
+
+    return std::make_unique<random_walk_ret_t>(std::move(rw_tri));
+
+  } else {
+    CUGRAPH_FAIL("Unsupported weight type.");
+  }
+}
+
 // Wrapper for calling SSSP through a graph container
 template <typename vertex_t, typename weight_t>
 void call_sssp(raft::handle_t const& handle,
@@ -1038,6 +1093,27 @@ template std::unique_ptr<cy_multi_edgelists_t> call_egonet<int64_t, double>(
   int64_t n_subgraphs,
   int64_t radius);
 
+template std::unique_ptr<random_walk_ret_t> call_random_walks<int32_t, int32_t>(
+  raft::handle_t const& handle,
+  graph_container_t const& graph_container,
+  int32_t const* ptr_start_set,
+  int32_t num_paths,
+  int32_t max_depth);
+
+template std::unique_ptr<random_walk_ret_t> call_random_walks<int32_t, int64_t>(
+  raft::handle_t const& handle,
+  graph_container_t const& graph_container,
+  int32_t const* ptr_start_set,
+  int64_t num_paths,
+  int64_t max_depth);
+
+template std::unique_ptr<random_walk_ret_t> call_random_walks<int64_t, int64_t>(
+  raft::handle_t const& handle,
+  graph_container_t const& graph_container,
+  int64_t const* ptr_start_set,
+  int64_t num_paths,
+  int64_t max_depth);
+
 template void call_sssp(raft::handle_t const& handle,
                         graph_container_t const& graph_container,
                         int32_t* identifiers,
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 5292f9f9997..3b65b0edb29 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -415,6 +415,20 @@ set(EXPERIMENTAL_KATZ_CENTRALITY_TEST_SRCS
 
 ConfigureTest(EXPERIMENTAL_KATZ_CENTRALITY_TEST "${EXPERIMENTAL_KATZ_CENTRALITY_TEST_SRCS}")
 
+###################################################################################################
+# - Experimental RANDOM_WALKS tests ------------------------------------------------------------
+
+set(EXPERIMENTAL_RANDOM_WALKS_TEST_SRCS
+    "${CMAKE_CURRENT_SOURCE_DIR}/experimental/random_walks_test.cu")
+
+ConfigureTest(EXPERIMENTAL_RANDOM_WALKS_TEST "${EXPERIMENTAL_RANDOM_WALKS_TEST_SRCS}")
+
+###################################################################################################
+set(EXPERIMENTAL_RANDOM_WALKS_LOW_LEVEL_SRCS
+    "${CMAKE_CURRENT_SOURCE_DIR}/experimental/rw_low_level_test.cu")
+
+ConfigureTest(EXPERIMENTAL_RANDOM_WALKS_LOW_LEVEL_TEST "${EXPERIMENTAL_RANDOM_WALKS_LOW_LEVEL_SRCS}")
+
 
 ###################################################################################################
 # - MG tests --------------------------------------------------------------------------------------
diff --git a/cpp/tests/experimental/random_walks_test.cu b/cpp/tests/experimental/random_walks_test.cu
new file mode 100644
index 00000000000..9fb1716f62b
--- /dev/null
+++ b/cpp/tests/experimental/random_walks_test.cu
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cuda_profiler_api.h"
+#include "gtest/gtest.h"
+
+#include <utilities/base_fixture.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <rmm/thrust_rmm_allocator.h>
+#include <thrust/random.h>
+
+#include <algorithms.hpp>
+#include <experimental/random_walks.cuh>
+#include <graph.hpp>
+
+#include <raft/handle.hpp>
+#include <raft/random/rng.cuh>
+
+#include "random_walks_utils.cuh"
+
+#include <algorithm>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <utilities/high_res_timer.hpp>
+#include <vector>
+
+namespace {  // anonym.
+template <typename vertex_t, typename index_t>
+void fill_start(raft::handle_t const& handle,
+                rmm::device_uvector<vertex_t>& d_start,
+                index_t num_vertices)
+{
+  index_t num_paths = d_start.size();
+
+  thrust::transform(rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+                    thrust::make_counting_iterator<index_t>(0),
+                    thrust::make_counting_iterator<index_t>(num_paths),
+
+                    d_start.begin(),
+                    [num_vertices] __device__(auto indx) { return indx % num_vertices; });
+}
+}  // namespace
+
+struct RandomWalks_Usecase {
+  std::string graph_file_full_path{};
+  bool test_weighted{false};
+
+  RandomWalks_Usecase(std::string const& graph_file_path, bool test_weighted)
+    : test_weighted(test_weighted)
+  {
+    if ((graph_file_path.length() > 0) && (graph_file_path[0] != '/')) {
+      graph_file_full_path = cugraph::test::get_rapids_dataset_root_dir() + "/" + graph_file_path;
+    } else {
+      graph_file_full_path = graph_file_path;
+    }
+  };
+};
+
+class Tests_RandomWalks : public ::testing::TestWithParam<RandomWalks_Usecase> {
+ public:
+  Tests_RandomWalks() {}
+  static void SetupTestCase() {}
+  static void TearDownTestCase() {}
+
+  virtual void SetUp() {}
+  virtual void TearDown() {}
+
+  template <typename vertex_t, typename edge_t, typename weight_t>
+  void run_current_test(RandomWalks_Usecase const& configuration)
+  {
+    raft::handle_t handle{};
+
+    // debuf info:
+    //
+    // std::cout << "read graph file: " << configuration.graph_file_full_path << std::endl;
+
+    cugraph::experimental::graph_t<vertex_t, edge_t, weight_t, false, false> graph(handle);
+    std::tie(graph, std::ignore) =
+      cugraph::test::read_graph_from_matrix_market_file<vertex_t, edge_t, weight_t, false, false>(
+        handle, configuration.graph_file_full_path, configuration.test_weighted, false);
+
+    auto graph_view = graph.view();
+
+    // call random_walks:
+    start_random_walks(graph_view);
+  }
+
+  template <typename graph_vt>
+  void start_random_walks(graph_vt const& graph_view)
+  {
+    using vertex_t = typename graph_vt::vertex_type;
+    using edge_t   = typename graph_vt::edge_type;
+    using weight_t = typename graph_vt::weight_type;
+
+    raft::handle_t handle{};
+    edge_t num_paths = 10;
+    rmm::device_uvector<vertex_t> d_start(num_paths, handle.get_stream());
+
+    vertex_t num_vertices = graph_view.get_number_of_vertices();
+    fill_start(handle, d_start, num_vertices);
+
+    // 0-copy const device view:
+    //
+    cugraph::experimental::detail::device_const_vector_view<vertex_t, edge_t> d_start_view{
+      d_start.data(), num_paths};
+
+    edge_t max_depth{10};
+
+    auto ret_tuple =
+      cugraph::experimental::detail::random_walks_impl(handle, graph_view, d_start_view, max_depth);
+
+    // check results:
+    //
+    bool test_all_paths = cugraph::test::host_check_rw_paths(
+      handle, graph_view, std::get<0>(ret_tuple), std::get<1>(ret_tuple), std::get<2>(ret_tuple));
+
+    if (!test_all_paths)
+      std::cout << "starting seed on failure: " << std::get<3>(ret_tuple) << '\n';
+
+    ASSERT_TRUE(test_all_paths);
+  }
+};
+
+TEST_P(Tests_RandomWalks, Initialize_i32_i32_f)
+{
+  run_current_test<int32_t, int32_t, float>(GetParam());
+}
+
+INSTANTIATE_TEST_CASE_P(
+  simple_test,
+  Tests_RandomWalks,
+  ::testing::Values(RandomWalks_Usecase("test/datasets/karate.mtx", true),
+                    RandomWalks_Usecase("test/datasets/web-Google.mtx", true),
+                    RandomWalks_Usecase("test/datasets/ljournal-2008.mtx", true),
+                    RandomWalks_Usecase("test/datasets/webbase-1M.mtx", true)));
+
+CUGRAPH_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/experimental/random_walks_utils.cuh b/cpp/tests/experimental/random_walks_utils.cuh
new file mode 100644
index 00000000000..863094dc310
--- /dev/null
+++ b/cpp/tests/experimental/random_walks_utils.cuh
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <rmm/thrust_rmm_allocator.h>
+#include <experimental/random_walks.cuh>
+#include <graph.hpp>
+
+#include <raft/handle.hpp>
+
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <vector>
+
+// utilities for testing / verification of Random Walks functionality:
+//
+namespace cugraph {
+namespace test {
+
+template <typename value_t>
+using vector_test_t = cugraph::experimental::detail::device_vec_t<value_t>;  // for debug purposes
+
+// host side utility to check a if a sequence of vertices is connected:
+//
+template <typename vertex_t, typename edge_t, typename weight_t>
+bool host_check_path(std::vector<edge_t> const& row_offsets,
+                     std::vector<vertex_t> const& col_inds,
+                     std::vector<weight_t> const& values,
+                     typename std::vector<vertex_t>::const_iterator v_path_begin,
+                     typename std::vector<vertex_t>::const_iterator v_path_end,
+                     typename std::vector<weight_t>::const_iterator w_path_begin)
+{
+  bool assert1 = (row_offsets.size() > 0);
+  bool assert2 = (col_inds.size() == values.size());
+
+  vertex_t num_rows = row_offsets.size() - 1;
+  edge_t nnz        = row_offsets.back();
+
+  bool assert3 = (nnz == static_cast<edge_t>(col_inds.size()));
+  if (assert1 == false || assert2 == false || assert3 == false) {
+    std::cout << "CSR inconsistency\n";
+    return false;
+  }
+
+  auto it_w = w_path_begin;
+  for (auto it_v = v_path_begin; it_v != v_path_end - 1; ++it_v, ++it_w) {
+    auto crt_vertex  = *it_v;
+    auto next_vertex = *(it_v + 1);
+
+    auto begin      = col_inds.begin() + row_offsets[crt_vertex];
+    auto end        = col_inds.begin() + row_offsets[crt_vertex + 1];
+    auto found_next = std::find_if(
+      begin, end, [next_vertex](auto dst_vertex) { return dst_vertex == next_vertex; });
+    if (found_next == end) {
+      std::cout << "vertex not found: " << next_vertex << " as neighbor of " << crt_vertex << '\n';
+      return false;
+    }
+
+    auto delta = row_offsets[crt_vertex] + std::distance(begin, found_next);
+
+    // std::cout << "delta in ci: " << delta << '\n';
+    auto found_edge = values.begin() + delta;
+    if (*found_edge != *it_w) {
+      std::cout << "weight not found: " << *found_edge << " between " << crt_vertex << " and "
+                << next_vertex << '\n';
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename vertex_t, typename edge_t, typename weight_t, typename index_t = edge_t>
+bool host_check_rw_paths(
+  raft::handle_t const& handle,
+  cugraph::experimental::graph_view_t<vertex_t, edge_t, weight_t, false, false> const& graph_view,
+  vector_test_t<vertex_t> const& d_coalesced_v,
+  vector_test_t<weight_t> const& d_coalesced_w,
+  vector_test_t<index_t> const& d_sizes)
+{
+  edge_t num_edges      = graph_view.get_number_of_edges();
+  vertex_t num_vertices = graph_view.get_number_of_vertices();
+
+  edge_t const* offsets   = graph_view.offsets();
+  vertex_t const* indices = graph_view.indices();
+  weight_t const* values  = graph_view.weights();
+
+  std::vector<edge_t> v_ro(num_vertices + 1);
+  std::vector<vertex_t> v_ci(num_edges);
+  std::vector<weight_t> v_vals(num_edges);
+
+  raft::update_host(v_ro.data(), offsets, v_ro.size(), handle.get_stream());
+  raft::update_host(v_ci.data(), indices, v_ci.size(), handle.get_stream());
+  raft::update_host(v_vals.data(), values, v_vals.size(), handle.get_stream());
+
+  std::vector<vertex_t> v_coalesced(d_coalesced_v.size());
+  std::vector<weight_t> w_coalesced(d_coalesced_w.size());
+  std::vector<index_t> v_sizes(d_sizes.size());
+
+  raft::update_host(v_coalesced.data(),
+                    cugraph::experimental::detail::raw_const_ptr(d_coalesced_v),
+                    d_coalesced_v.size(),
+                    handle.get_stream());
+  raft::update_host(w_coalesced.data(),
+                    cugraph::experimental::detail::raw_const_ptr(d_coalesced_w),
+                    d_coalesced_w.size(),
+                    handle.get_stream());
+  raft::update_host(v_sizes.data(),
+                    cugraph::experimental::detail::raw_const_ptr(d_sizes),
+                    d_sizes.size(),
+                    handle.get_stream());
+
+  auto it_v_begin = v_coalesced.begin();
+  auto it_w_begin = w_coalesced.begin();
+  for (auto&& crt_sz : v_sizes) {
+    auto it_v_end = it_v_begin + crt_sz;
+
+    bool test_path = host_check_path(v_ro, v_ci, v_vals, it_v_begin, it_v_end, it_w_begin);
+
+    it_v_begin = it_v_end;
+    it_w_begin += crt_sz - 1;
+
+    if (!test_path) {  // something went wrong; print to debug (since it's random)
+      raft::print_host_vector("sizes", v_sizes.data(), v_sizes.size(), std::cout);
+
+      raft::print_host_vector("coalesced v", v_coalesced.data(), v_coalesced.size(), std::cout);
+
+      raft::print_host_vector("coalesced w", w_coalesced.data(), w_coalesced.size(), std::cout);
+
+      return false;
+    }
+  }
+  return true;
+}
+
+}  // namespace test
+}  // namespace cugraph
diff --git a/cpp/tests/experimental/rw_low_level_test.cu b/cpp/tests/experimental/rw_low_level_test.cu
new file mode 100644
index 00000000000..a32e258d366
--- /dev/null
+++ b/cpp/tests/experimental/rw_low_level_test.cu
@@ -0,0 +1,783 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include "cuda_profiler_api.h"
+
+#include <utilities/base_fixture.hpp>
+#include <utilities/test_utilities.hpp>
+
+#include <rmm/thrust_rmm_allocator.h>
+#include <thrust/random.h>
+
+#include <algorithms.hpp>
+#include <experimental/random_walks.cuh>
+#include <graph.hpp>
+
+#include <raft/handle.hpp>
+#include <raft/random/rng.cuh>
+
+#include "random_walks_utils.cuh"
+
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <numeric>
+#include <utilities/high_res_timer.hpp>
+#include <vector>
+
+using namespace cugraph::experimental;
+
+template <typename value_t>
+using vector_test_t = detail::device_vec_t<value_t>;  // for debug purposes
+
+namespace {  // anonym.
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+graph_t<vertex_t, edge_t, weight_t, false, false> make_graph(raft::handle_t const& handle,
+                                                             std::vector<vertex_t> const& v_src,
+                                                             std::vector<vertex_t> const& v_dst,
+                                                             std::vector<weight_t> const& v_w,
+                                                             vertex_t num_vertices,
+                                                             edge_t num_edges)
+{
+  vector_test_t<vertex_t> d_src(num_edges, handle.get_stream());
+  vector_test_t<vertex_t> d_dst(num_edges, handle.get_stream());
+  vector_test_t<weight_t> d_weights(num_edges, handle.get_stream());
+
+  raft::update_device(d_src.data(), v_src.data(), d_src.size(), handle.get_stream());
+  raft::update_device(d_dst.data(), v_dst.data(), d_dst.size(), handle.get_stream());
+  raft::update_device(d_weights.data(), v_w.data(), d_weights.size(), handle.get_stream());
+
+  edgelist_t<vertex_t, edge_t, weight_t> edgelist{
+    d_src.data(), d_dst.data(), d_weights.data(), num_edges};
+
+  graph_t<vertex_t, edge_t, weight_t, false, false> graph(
+    handle, edgelist, num_vertices, graph_properties_t{}, false);
+
+  return graph;
+}
+
+template <typename vertex_t, typename edge_t, typename index_t>
+bool check_col_indices(raft::handle_t const& handle,
+                       vector_test_t<edge_t> const& d_crt_out_degs,
+                       vector_test_t<vertex_t> const& d_col_indx,
+                       index_t num_paths)
+{
+  bool all_indices_within_degs = thrust::all_of(
+    rmm::exec_policy(handle.get_stream())->on(handle.get_stream()),
+    thrust::make_counting_iterator<index_t>(0),
+    thrust::make_counting_iterator<index_t>(num_paths),
+    [p_d_col_indx     = detail::raw_const_ptr(d_col_indx),
+     p_d_crt_out_degs = detail::raw_const_ptr(d_crt_out_degs)] __device__(auto indx) {
+      if (p_d_crt_out_degs[indx] > 0)
+        return ((p_d_col_indx[indx] >= 0) && (p_d_col_indx[indx] < p_d_crt_out_degs[indx]));
+      else
+        return true;
+    });
+  return all_indices_within_degs;
+}
+
+}  // namespace
+
+// FIXME (per rlratzel request):
+// This test may be considered an e2e test
+// which could be moved to a different test suite:
+//
+struct RandomWalksPrimsTest : public ::testing::Test {
+};
+
+TEST_F(RandomWalksPrimsTest, SimpleGraphRWStart)
+{
+  using namespace cugraph::experimental::detail;
+
+  using vertex_t = int32_t;
+  using edge_t   = vertex_t;
+  using weight_t = float;
+  using index_t  = vertex_t;
+
+  raft::handle_t handle{};
+
+  edge_t num_edges      = 8;
+  vertex_t num_vertices = 6;
+
+  std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
+  std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1};
+
+  auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges);
+
+  auto graph_view = graph.view();
+
+  edge_t const* offsets   = graph_view.offsets();
+  vertex_t const* indices = graph_view.indices();
+  weight_t const* values  = graph_view.weights();
+
+  std::vector<edge_t> v_ro(num_vertices + 1);
+  std::vector<vertex_t> v_ci(num_edges);
+  std::vector<weight_t> v_vs(num_edges);
+
+  raft::update_host(v_ro.data(), offsets, num_vertices + 1, handle.get_stream());
+  raft::update_host(v_ci.data(), indices, num_edges, handle.get_stream());
+  raft::update_host(v_vs.data(), values, num_edges, handle.get_stream());
+
+  std::vector<edge_t> v_ro_expected{0, 1, 3, 6, 7, 8, 8};
+  std::vector<vertex_t> v_ci_expected{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_vs_expected{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1};
+
+  EXPECT_EQ(v_ro, v_ro_expected);
+  EXPECT_EQ(v_ci, v_ci_expected);
+  EXPECT_EQ(v_vs, v_vs_expected);
+
+  index_t num_paths = 4;
+  index_t max_depth = 3;
+  index_t total_sz  = num_paths * max_depth;
+
+  std::vector<vertex_t> v_coalesced(total_sz, -1);
+  std::vector<weight_t> w_coalesced(total_sz - num_paths, -1);
+
+  vector_test_t<vertex_t> d_coalesced_v(total_sz, handle.get_stream());
+  vector_test_t<weight_t> d_coalesced_w(total_sz - num_paths, handle.get_stream());
+
+  raft::update_device(
+    d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream());
+  raft::update_device(
+    d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream());
+
+  std::vector<vertex_t> v_start{1, 0, 4, 2};
+  vector_test_t<vertex_t> d_start(num_paths, handle.get_stream());
+
+  raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream());
+
+  vector_test_t<index_t> d_sizes(num_paths, handle.get_stream());
+
+  random_walker_t<decltype(graph_view)> rand_walker{handle, graph_view, num_paths, max_depth};
+
+  rand_walker.start(d_start, d_coalesced_v, d_sizes);
+
+  std::vector<vertex_t> v_coalesced_exp{1, -1, -1, 0, -1, -1, 4, -1, -1, 2, -1, -1};
+  raft::update_host(
+    v_coalesced.data(), raw_const_ptr(d_coalesced_v), total_sz, handle.get_stream());
+  EXPECT_EQ(v_coalesced, v_coalesced_exp);
+
+  std::vector<index_t> v_sizes{1, 1, 1, 1};
+  std::vector<index_t> v_sz_exp(num_paths);
+  raft::update_host(v_sz_exp.data(), raw_const_ptr(d_sizes), num_paths, handle.get_stream());
+
+  EXPECT_EQ(v_sizes, v_sz_exp);
+}
+
+TEST_F(RandomWalksPrimsTest, SimpleGraphCoalesceExperiments)
+{
+  using namespace cugraph::experimental::detail;
+
+  using vertex_t = int32_t;
+  using edge_t   = vertex_t;
+  using weight_t = float;
+  using index_t  = vertex_t;
+
+  raft::handle_t handle{};
+
+  edge_t num_edges      = 8;
+  vertex_t num_vertices = 6;
+
+  std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
+  std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1};
+
+  auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges);
+
+  auto graph_view = graph.view();
+
+  edge_t const* offsets   = graph_view.offsets();
+  vertex_t const* indices = graph_view.indices();
+  weight_t const* values  = graph_view.weights();
+
+  index_t num_paths = 4;
+  index_t max_depth = 3;
+  index_t total_sz  = num_paths * max_depth;
+
+  std::vector<vertex_t> v_coalesced(total_sz, -1);
+  std::vector<weight_t> w_coalesced(total_sz - num_paths, -1);
+
+  vector_test_t<vertex_t> d_coalesced_v(total_sz, handle.get_stream());
+  vector_test_t<weight_t> d_coalesced_w(total_sz - num_paths, handle.get_stream());
+
+  raft::update_device(
+    d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream());
+  raft::update_device(
+    d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream());
+
+  std::vector<vertex_t> v_start{1, 0, 4, 2};
+  vector_test_t<vertex_t> d_start(num_paths, handle.get_stream());
+
+  raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream());
+
+  vector_test_t<index_t> d_sizes(num_paths, handle.get_stream());
+
+  random_walker_t<decltype(graph_view)> rand_walker{handle, graph_view, num_paths, max_depth};
+
+  auto const& d_out_degs = rand_walker.get_out_degs();
+  EXPECT_EQ(static_cast<size_t>(num_vertices), d_out_degs.size());
+
+  std::vector<edge_t> v_out_degs(num_vertices);
+  raft::update_host(
+    v_out_degs.data(), raw_const_ptr(d_out_degs), num_vertices, handle.get_stream());
+
+  std::vector<edge_t> v_out_degs_exp{1, 2, 3, 1, 1, 0};
+  EXPECT_EQ(v_out_degs, v_out_degs_exp);
+
+  rand_walker.start(d_start, d_coalesced_v, d_sizes);
+
+  // update crt_out_degs:
+  //
+  vector_test_t<edge_t> d_crt_out_degs(num_paths, handle.get_stream());
+  rand_walker.gather_from_coalesced(
+    d_coalesced_v, d_out_degs, d_sizes, d_crt_out_degs, max_depth, num_paths);
+
+  std::vector<edge_t> v_crt_out_degs(num_paths);
+  raft::update_host(
+    v_crt_out_degs.data(), raw_const_ptr(d_crt_out_degs), num_paths, handle.get_stream());
+
+  std::vector<edge_t> v_crt_out_degs_exp{2, 1, 1, 3};
+  EXPECT_EQ(v_crt_out_degs, v_crt_out_degs_exp);
+}
+
+TEST_F(RandomWalksPrimsTest, SimpleGraphColExtraction)
+{
+  using namespace cugraph::experimental::detail;
+
+  using vertex_t = int32_t;
+  using edge_t   = vertex_t;
+  using weight_t = float;
+  using index_t  = vertex_t;
+
+  raft::handle_t handle{};
+
+  edge_t num_edges      = 8;
+  vertex_t num_vertices = 6;
+
+  std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
+  std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1};
+
+  auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges);
+
+  auto graph_view = graph.view();
+
+  edge_t const* offsets   = graph_view.offsets();
+  vertex_t const* indices = graph_view.indices();
+  weight_t const* values  = graph_view.weights();
+
+  index_t num_paths = 4;
+  index_t max_depth = 3;
+  index_t total_sz  = num_paths * max_depth;
+
+  std::vector<vertex_t> v_coalesced(total_sz, -1);
+  std::vector<weight_t> w_coalesced(total_sz - num_paths, -1);
+
+  vector_test_t<vertex_t> d_coalesced_v(total_sz, handle.get_stream());
+  vector_test_t<weight_t> d_coalesced_w(total_sz - num_paths, handle.get_stream());
+
+  raft::update_device(
+    d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream());
+  raft::update_device(
+    d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream());
+
+  std::vector<vertex_t> v_start{1, 0, 4, 2};
+  vector_test_t<vertex_t> d_start(num_paths, handle.get_stream());
+
+  raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream());
+
+  vector_test_t<index_t> d_sizes(num_paths, handle.get_stream());
+
+  random_walker_t<decltype(graph_view)> rand_walker{handle, graph_view, num_paths, max_depth};
+
+  auto const& d_out_degs = rand_walker.get_out_degs();
+
+  rand_walker.start(d_start, d_coalesced_v, d_sizes);
+
+  // update crt_out_degs:
+  //
+  vector_test_t<edge_t> d_crt_out_degs(num_paths, handle.get_stream());
+  rand_walker.gather_from_coalesced(
+    d_coalesced_v, d_out_degs, d_sizes, d_crt_out_degs, max_depth, num_paths);
+
+  col_indx_extract_t<decltype(graph_view), index_t> col_extractor{handle,
+                                                                  graph_view,
+                                                                  raw_const_ptr(d_crt_out_degs),
+                                                                  raw_const_ptr(d_sizes),
+                                                                  num_paths,
+                                                                  max_depth};
+
+  // typically given by random engine:
+  //
+  std::vector<vertex_t> v_col_indx{1, 0, 0, 2};
+  vector_test_t<vertex_t> d_col_indx(num_paths, handle.get_stream());
+
+  raft::update_device(d_col_indx.data(), v_col_indx.data(), d_col_indx.size(), handle.get_stream());
+
+  vector_test_t<vertex_t> d_next_v(num_paths, handle.get_stream());
+  vector_test_t<weight_t> d_next_w(num_paths, handle.get_stream());
+
+  col_extractor(d_coalesced_v, d_col_indx, d_next_v, d_next_w);
+
+  std::vector<vertex_t> v_next_v(num_paths);
+  std::vector<weight_t> v_next_w(num_paths);
+
+  raft::update_host(v_next_v.data(), raw_const_ptr(d_next_v), num_paths, handle.get_stream());
+  raft::update_host(v_next_w.data(), raw_const_ptr(d_next_w), num_paths, handle.get_stream());
+
+  std::vector<vertex_t> v_next_v_exp{4, 1, 5, 3};
+  std::vector<weight_t> v_next_w_exp{2.1f, 0.1f, 7.1f, 5.1f};
+
+  EXPECT_EQ(v_next_v, v_next_v_exp);
+  EXPECT_EQ(v_next_w, v_next_w_exp);
+}
+
+TEST_F(RandomWalksPrimsTest, SimpleGraphRndGenColIndx)
+{
+  using namespace cugraph::experimental::detail;
+
+  using vertex_t = int32_t;
+  using edge_t   = vertex_t;
+  using weight_t = float;
+  using index_t  = vertex_t;
+  using real_t   = float;
+  using seed_t   = long;
+
+  using random_engine_t = rrandom_gen_t<vertex_t, edge_t>;
+
+  raft::handle_t handle{};
+
+  edge_t num_edges      = 8;
+  vertex_t num_vertices = 6;
+
+  std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
+  std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1};
+
+  auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges);
+
+  auto graph_view = graph.view();
+
+  edge_t const* offsets   = graph_view.offsets();
+  vertex_t const* indices = graph_view.indices();
+  weight_t const* values  = graph_view.weights();
+
+  index_t num_paths = 4;
+  index_t max_depth = 3;
+  index_t total_sz  = num_paths * max_depth;
+
+  std::vector<vertex_t> v_coalesced(total_sz, -1);
+  std::vector<weight_t> w_coalesced(total_sz - num_paths, -1);
+
+  vector_test_t<vertex_t> d_coalesced_v(total_sz, handle.get_stream());
+  vector_test_t<weight_t> d_coalesced_w(total_sz - num_paths, handle.get_stream());
+
+  raft::update_device(
+    d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream());
+  raft::update_device(
+    d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream());
+
+  std::vector<vertex_t> v_start{1, 0, 4, 2};
+  vector_test_t<vertex_t> d_start(num_paths, handle.get_stream());
+
+  raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream());
+
+  vector_test_t<index_t> d_sizes(num_paths, handle.get_stream());
+
+  random_walker_t<decltype(graph_view)> rand_walker{handle, graph_view, num_paths, max_depth};
+
+  auto const& d_out_degs = rand_walker.get_out_degs();
+
+  rand_walker.start(d_start, d_coalesced_v, d_sizes);
+
+  // update crt_out_degs:
+  //
+  vector_test_t<edge_t> d_crt_out_degs(num_paths, handle.get_stream());
+  rand_walker.gather_from_coalesced(
+    d_coalesced_v, d_out_degs, d_sizes, d_crt_out_degs, max_depth, num_paths);
+
+  // random engine generated:
+  //
+  vector_test_t<vertex_t> d_col_indx(num_paths, handle.get_stream());
+  vector_test_t<real_t> d_random(num_paths, handle.get_stream());
+
+  seed_t seed = static_cast<seed_t>(std::time(nullptr));
+  random_engine_t rgen(handle, num_paths, d_random, d_crt_out_degs, seed);
+  rgen.generate_col_indices(d_col_indx);
+
+  bool all_indices_within_degs = check_col_indices(handle, d_crt_out_degs, d_col_indx, num_paths);
+
+  ASSERT_TRUE(all_indices_within_degs);
+}
+
+TEST_F(RandomWalksPrimsTest, SimpleGraphUpdatePathSizes)
+{
+  using namespace cugraph::experimental::detail;
+
+  using vertex_t = int32_t;
+  using edge_t   = vertex_t;
+  using weight_t = float;
+  using index_t  = vertex_t;
+  using real_t   = float;
+  using seed_t   = long;
+
+  using random_engine_t = rrandom_gen_t<vertex_t, edge_t>;
+
+  raft::handle_t handle{};
+
+  edge_t num_edges      = 8;
+  vertex_t num_vertices = 6;
+
+  std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
+  std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1};
+
+  auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges);
+
+  auto graph_view = graph.view();
+
+  edge_t const* offsets   = graph_view.offsets();
+  vertex_t const* indices = graph_view.indices();
+  weight_t const* values  = graph_view.weights();
+
+  index_t num_paths = 4;
+  index_t max_depth = 3;
+  index_t total_sz  = num_paths * max_depth;
+
+  std::vector<vertex_t> v_coalesced(total_sz, -1);
+  std::vector<weight_t> w_coalesced(total_sz - num_paths, -1);
+
+  vector_test_t<vertex_t> d_coalesced_v(total_sz, handle.get_stream());
+  vector_test_t<weight_t> d_coalesced_w(total_sz - num_paths, handle.get_stream());
+
+  raft::update_device(
+    d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream());
+  raft::update_device(
+    d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream());
+
+  std::vector<vertex_t> v_start{1, 0, 4, 2};
+  vector_test_t<vertex_t> d_start(num_paths, handle.get_stream());
+
+  raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream());
+
+  vector_test_t<index_t> d_sizes(num_paths, handle.get_stream());
+
+  random_walker_t<decltype(graph_view)> rand_walker{handle, graph_view, num_paths, max_depth};
+
+  auto const& d_out_degs = rand_walker.get_out_degs();
+
+  rand_walker.start(d_start, d_coalesced_v, d_sizes);
+
+  // Fixed  set of out-degs, as opposed to have them generated by the algorithm.
+  // That's because I want to test a certain functionality in isolation
+  //
+  std::vector<edge_t> v_crt_out_degs{2, 0, 1, 0};
+  vector_test_t<edge_t> d_crt_out_degs(num_paths, handle.get_stream());
+  raft::update_device(
+    d_crt_out_degs.data(), v_crt_out_degs.data(), d_crt_out_degs.size(), handle.get_stream());
+
+  rand_walker.update_path_sizes(d_crt_out_degs, d_sizes);
+
+  std::vector<index_t> v_sizes(num_paths);
+  raft::update_host(v_sizes.data(), raw_const_ptr(d_sizes), num_paths, handle.get_stream());
+  std::vector<index_t> v_sizes_exp{2, 1, 2, 1};
+  // i.e., corresponding 0-entries in crt-out-degs, don't get updated;
+
+  EXPECT_EQ(v_sizes, v_sizes_exp);
+}
+
+TEST_F(RandomWalksPrimsTest, SimpleGraphScatterUpdate)
+{
+  using namespace cugraph::experimental::detail;
+
+  using vertex_t = int32_t;
+  using edge_t   = vertex_t;
+  using weight_t = float;
+  using index_t  = vertex_t;
+
+  raft::handle_t handle{};
+
+  edge_t num_edges      = 8;
+  vertex_t num_vertices = 6;
+
+  std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
+  std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1};
+
+  auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges);
+
+  auto graph_view = graph.view();
+
+  edge_t const* offsets   = graph_view.offsets();
+  vertex_t const* indices = graph_view.indices();
+  weight_t const* values  = graph_view.weights();
+
+  index_t num_paths = 4;
+  index_t max_depth = 3;
+  index_t total_sz  = num_paths * max_depth;
+
+  std::vector<vertex_t> v_coalesced(total_sz, -1);
+  std::vector<weight_t> w_coalesced(total_sz - num_paths, -1);
+
+  vector_test_t<vertex_t> d_coalesced_v(total_sz, handle.get_stream());
+  vector_test_t<weight_t> d_coalesced_w(total_sz - num_paths, handle.get_stream());
+
+  raft::update_device(
+    d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream());
+  raft::update_device(
+    d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream());
+
+  std::vector<vertex_t> v_start{1, 0, 4, 2};
+  vector_test_t<vertex_t> d_start(num_paths, handle.get_stream());
+
+  raft::update_device(d_start.data(), v_start.data(), d_start.size(), handle.get_stream());
+
+  vector_test_t<index_t> d_sizes(num_paths, handle.get_stream());
+
+  random_walker_t<decltype(graph_view)> rand_walker{handle, graph_view, num_paths, max_depth};
+
+  auto const& d_out_degs = rand_walker.get_out_degs();
+
+  rand_walker.start(d_start, d_coalesced_v, d_sizes);
+
+  // update crt_out_degs:
+  //
+  vector_test_t<edge_t> d_crt_out_degs(num_paths, handle.get_stream());
+  rand_walker.gather_from_coalesced(
+    d_coalesced_v, d_out_degs, d_sizes, d_crt_out_degs, max_depth, num_paths);
+
+  col_indx_extract_t<decltype(graph_view), index_t> col_extractor{handle,
+                                                                  graph_view,
+                                                                  raw_const_ptr(d_crt_out_degs),
+                                                                  raw_const_ptr(d_sizes),
+                                                                  num_paths,
+                                                                  max_depth};
+
+  // typically given by random engine:
+  //
+  std::vector<vertex_t> v_col_indx{1, 0, 0, 2};
+  vector_test_t<vertex_t> d_col_indx(num_paths, handle.get_stream());
+
+  raft::update_device(d_col_indx.data(), v_col_indx.data(), d_col_indx.size(), handle.get_stream());
+
+  vector_test_t<vertex_t> d_next_v(num_paths, handle.get_stream());
+  vector_test_t<weight_t> d_next_w(num_paths, handle.get_stream());
+
+  col_extractor(d_coalesced_v, d_col_indx, d_next_v, d_next_w);
+
+  rand_walker.update_path_sizes(d_crt_out_degs, d_sizes);
+
+  // check start():
+  //
+  {
+    std::vector<vertex_t> v_coalesced_exp{1, -1, -1, 0, -1, -1, 4, -1, -1, 2, -1, -1};
+    raft::update_host(
+      v_coalesced.data(), raw_const_ptr(d_coalesced_v), total_sz, handle.get_stream());
+    EXPECT_EQ(v_coalesced, v_coalesced_exp);
+  }
+
+  // check crt_out_degs:
+  //
+  {
+    std::vector<edge_t> v_crt_out_degs(num_paths);
+    raft::update_host(
+      v_crt_out_degs.data(), raw_const_ptr(d_crt_out_degs), num_paths, handle.get_stream());
+    std::vector<edge_t> v_crt_out_degs_exp{2, 1, 1, 3};
+    EXPECT_EQ(v_crt_out_degs, v_crt_out_degs_exp);
+  }
+
+  // check paths sizes update:
+  //
+  {
+    std::vector<index_t> v_sizes(num_paths);
+    raft::update_host(v_sizes.data(), raw_const_ptr(d_sizes), num_paths, handle.get_stream());
+    std::vector<index_t> v_sizes_exp{2, 2, 2, 2};
+    // i.e., corresponding 0-entries in crt-out-degs, don't get updated;
+    EXPECT_EQ(v_sizes, v_sizes_exp);
+  }
+
+  // check next step:
+  //
+  {
+    std::vector<vertex_t> v_next_v(num_paths);
+    std::vector<weight_t> v_next_w(num_paths);
+
+    raft::update_host(v_next_v.data(), raw_const_ptr(d_next_v), num_paths, handle.get_stream());
+    raft::update_host(v_next_w.data(), raw_const_ptr(d_next_w), num_paths, handle.get_stream());
+
+    std::vector<vertex_t> v_next_v_exp{4, 1, 5, 3};
+    std::vector<weight_t> v_next_w_exp{2.1f, 0.1f, 7.1f, 5.1f};
+
+    EXPECT_EQ(v_next_v, v_next_v_exp);
+    EXPECT_EQ(v_next_w, v_next_w_exp);
+  }
+
+  rand_walker.scatter_vertices(d_next_v, d_coalesced_v, d_crt_out_degs, d_sizes);
+  rand_walker.scatter_weights(d_next_w, d_coalesced_w, d_crt_out_degs, d_sizes);
+
+  // check vertex/weight scatter:
+  //
+  {
+    raft::update_host(
+      v_coalesced.data(), raw_const_ptr(d_coalesced_v), total_sz, handle.get_stream());
+    raft::update_host(
+      w_coalesced.data(), raw_const_ptr(d_coalesced_w), total_sz - num_paths, handle.get_stream());
+
+    std::vector<vertex_t> v_coalesced_exp{1, 4, -1, 0, 1, -1, 4, 5, -1, 2, 3, -1};
+    std::vector<weight_t> w_coalesced_exp{2.1, -1, 0.1, -1, 7.1, -1, 5.1, -1};
+
+    EXPECT_EQ(v_coalesced, v_coalesced_exp);
+    EXPECT_EQ(w_coalesced, w_coalesced_exp);
+  }
+}
+
+TEST_F(RandomWalksPrimsTest, SimpleGraphCoalesceDefragment)
+{
+  using namespace cugraph::experimental::detail;
+
+  using vertex_t = int32_t;
+  using edge_t   = vertex_t;
+  using weight_t = float;
+  using index_t  = vertex_t;
+
+  raft::handle_t handle{};
+
+  edge_t num_edges      = 8;
+  vertex_t num_vertices = 6;
+
+  std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
+  std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1};
+
+  auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges);
+
+  auto graph_view = graph.view();
+
+  edge_t const* offsets   = graph_view.offsets();
+  vertex_t const* indices = graph_view.indices();
+  weight_t const* values  = graph_view.weights();
+
+  index_t num_paths = 4;
+  index_t max_depth = 3;
+  index_t total_sz  = num_paths * max_depth;
+
+  std::vector<index_t> v_sizes{1, 2, 2, 1};
+  vector_test_t<index_t> d_sizes(num_paths, handle.get_stream());
+  raft::update_device(d_sizes.data(), v_sizes.data(), d_sizes.size(), handle.get_stream());
+
+  std::vector<vertex_t> v_coalesced(total_sz, -1);
+  v_coalesced[0]                 = 3;
+  v_coalesced[max_depth]         = 5;
+  v_coalesced[max_depth + 1]     = 2;
+  v_coalesced[2 * max_depth]     = 4;
+  v_coalesced[2 * max_depth + 1] = 0;
+  v_coalesced[3 * max_depth]     = 1;
+
+  std::vector<weight_t> w_coalesced(total_sz - num_paths, -1);
+  w_coalesced[max_depth - 1]     = 10.1;
+  w_coalesced[2 * max_depth - 2] = 11.2;
+
+  vector_test_t<vertex_t> d_coalesced_v(total_sz, handle.get_stream());
+  vector_test_t<weight_t> d_coalesced_w(total_sz - num_paths, handle.get_stream());
+
+  raft::update_device(
+    d_coalesced_v.data(), v_coalesced.data(), d_coalesced_v.size(), handle.get_stream());
+  raft::update_device(
+    d_coalesced_w.data(), w_coalesced.data(), d_coalesced_w.size(), handle.get_stream());
+
+  random_walker_t<decltype(graph_view)> rand_walker{handle, graph_view, num_paths, max_depth};
+
+  rand_walker.stop(d_coalesced_v, d_coalesced_w, d_sizes);
+
+  // check vertex/weight defragment:
+  //
+  {
+    v_coalesced.resize(d_coalesced_v.size());
+    w_coalesced.resize(d_coalesced_w.size());
+
+    raft::update_host(
+      v_coalesced.data(), raw_const_ptr(d_coalesced_v), d_coalesced_v.size(), handle.get_stream());
+    raft::update_host(
+      w_coalesced.data(), raw_const_ptr(d_coalesced_w), d_coalesced_w.size(), handle.get_stream());
+
+    std::vector<vertex_t> v_coalesced_exp{3, 5, 2, 4, 0, 1};
+    std::vector<weight_t> w_coalesced_exp{10.1, 11.2};
+
+    EXPECT_EQ(v_coalesced, v_coalesced_exp);
+    EXPECT_EQ(w_coalesced, w_coalesced_exp);
+  }
+}
+
+TEST_F(RandomWalksPrimsTest, SimpleGraphRandomWalk)
+{
+  using vertex_t = int32_t;
+  using edge_t   = vertex_t;
+  using weight_t = float;
+  using index_t  = vertex_t;
+
+  raft::handle_t handle{};
+
+  edge_t num_edges      = 8;
+  vertex_t num_vertices = 6;
+
+  std::vector<vertex_t> v_src{0, 1, 1, 2, 2, 2, 3, 4};
+  std::vector<vertex_t> v_dst{1, 3, 4, 0, 1, 3, 5, 5};
+  std::vector<weight_t> v_w{0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1};
+
+  auto graph = make_graph(handle, v_src, v_dst, v_w, num_vertices, num_edges);
+
+  auto graph_view = graph.view();
+
+  edge_t const* offsets   = graph_view.offsets();
+  vertex_t const* indices = graph_view.indices();
+  weight_t const* values  = graph_view.weights();
+
+  std::vector<edge_t> v_ro(num_vertices + 1);
+  std::vector<vertex_t> v_ci(num_edges);
+  std::vector<weight_t> v_vals(num_edges);
+
+  raft::update_host(v_ro.data(), offsets, v_ro.size(), handle.get_stream());
+  raft::update_host(v_ci.data(), indices, v_ci.size(), handle.get_stream());
+  raft::update_host(v_vals.data(), values, v_vals.size(), handle.get_stream());
+
+  std::vector<vertex_t> v_start{1, 0, 4, 2};
+  vector_test_t<vertex_t> d_v_start(v_start.size(), handle.get_stream());
+  raft::update_device(d_v_start.data(), v_start.data(), d_v_start.size(), handle.get_stream());
+
+  index_t num_paths = v_start.size();
+  index_t max_depth = 5;
+
+  // 0-copy const device view:
+  //
+  detail::device_const_vector_view<vertex_t, index_t> d_start_view{d_v_start.data(), num_paths};
+  auto quad = detail::random_walks_impl(handle, graph_view, d_start_view, max_depth);
+
+  auto& d_coalesced_v = std::get<0>(quad);
+  auto& d_coalesced_w = std::get<1>(quad);
+  auto& d_sizes       = std::get<2>(quad);
+  auto seed0          = std::get<3>(quad);
+
+  bool test_all_paths =
+    cugraph::test::host_check_rw_paths(handle, graph_view, d_coalesced_v, d_coalesced_w, d_sizes);
+
+  if (!test_all_paths) std::cout << "starting seed on failure: " << seed0 << '\n';
+
+  ASSERT_TRUE(test_all_paths);
+}