From d61595091fe720bd5628e8a0d85fed5512ee2efb Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 16 Feb 2022 13:03:57 -0500
Subject: [PATCH 01/38] Using metrics from raft

---
 cpp/src/metrics/accuracy_score.cu             |   4 +-
 cpp/src/metrics/adjusted_rand_index.cu        |   6 +-
 cpp/src/metrics/completeness_score.cu         |   4 +-
 cpp/src/metrics/entropy.cu                    |   5 +-
 cpp/src/metrics/homogeneity_score.cu          |   4 +-
 cpp/src/metrics/kl_divergence.cu              |   6 +-
 cpp/src/metrics/mutual_info_score.cu          |   4 +-
 cpp/src/metrics/r2_score.cu                   |   6 +-
 cpp/src/metrics/rand_index.cu                 |   4 +-
 cpp/src/metrics/silhouette_score.cu           |   9 +-
 cpp/src/metrics/trustworthiness.cu            |   4 +-
 cpp/src/metrics/v_measure.cu                  |   4 +-
 cpp/src_prims/metrics/adjusted_rand_index.cuh | 195 -------
 .../metrics/batched/information_criterion.cuh |  85 ----
 .../metrics/batched/silhouette_score.cuh      | 278 ----------
 cpp/src_prims/metrics/completeness_score.cuh  |  69 ---
 cpp/src_prims/metrics/contingencyMatrix.cuh   | 312 ------------
 cpp/src_prims/metrics/dispersion.cuh          | 136 -----
 cpp/src_prims/metrics/entropy.cuh             | 152 ------
 cpp/src_prims/metrics/homogeneity_score.cuh   |  70 ---
 cpp/src_prims/metrics/kl_divergence.cuh       |  83 ---
 cpp/src_prims/metrics/mutual_info_score.cuh   | 176 -------
 cpp/src_prims/metrics/rand_index.cuh          | 164 ------
 cpp/src_prims/metrics/scores.cuh              | 215 --------
 cpp/src_prims/metrics/silhouette_score.cuh    | 331 ------------
 .../metrics/trustworthiness_score.cuh         | 218 --------
 cpp/src_prims/metrics/v_measure.cuh           |  63 ---
 cpp/src_prims/selection/columnWiseSort.cuh    | 346 -------------
 cpp/src_prims/selection/haversine_knn.cuh     | 136 -----
 cpp/src_prims/selection/knn.cuh               | 348 -------------
 cpp/src_prims/selection/processing.cuh        | 226 --------
 cpp/test/CMakeLists.txt                       |  31 +-
 cpp/test/prims/adjusted_rand_index.cu         | 201 --------
 cpp/test/prims/completeness_score.cu          | 136 -----
 cpp/test/prims/contingencyMatrix.cu           | 170 -------
 cpp/test/prims/dispersion.cu                  | 125 -----
 cpp/test/prims/entropy.cu                     | 118 -----
 cpp/test/prims/homogeneity_score.cu           | 134 -----
 cpp/test/prims/kl_divergence.cu               | 105 ----
 cpp/test/prims/mutual_info_score.cu           | 163 ------
 cpp/test/prims/rand_index.cu                  | 129 -----
 cpp/test/prims/score.cu                       | 481 ------------------
 cpp/test/prims/silhouette_score.cu            | 230 ---------
 cpp/test/prims/trustworthiness.cu             | 335 ------------
 cpp/test/prims/v_measure.cu                   | 139 -----
 45 files changed, 38 insertions(+), 6122 deletions(-)
 delete mode 100644 cpp/src_prims/metrics/adjusted_rand_index.cuh
 delete mode 100644 cpp/src_prims/metrics/batched/information_criterion.cuh
 delete mode 100644 cpp/src_prims/metrics/batched/silhouette_score.cuh
 delete mode 100644 cpp/src_prims/metrics/completeness_score.cuh
 delete mode 100644 cpp/src_prims/metrics/contingencyMatrix.cuh
 delete mode 100644 cpp/src_prims/metrics/dispersion.cuh
 delete mode 100644 cpp/src_prims/metrics/entropy.cuh
 delete mode 100644 cpp/src_prims/metrics/homogeneity_score.cuh
 delete mode 100644 cpp/src_prims/metrics/kl_divergence.cuh
 delete mode 100644 cpp/src_prims/metrics/mutual_info_score.cuh
 delete mode 100644 cpp/src_prims/metrics/rand_index.cuh
 delete mode 100644 cpp/src_prims/metrics/scores.cuh
 delete mode 100644 cpp/src_prims/metrics/silhouette_score.cuh
 delete mode 100644 cpp/src_prims/metrics/trustworthiness_score.cuh
 delete mode 100644 cpp/src_prims/metrics/v_measure.cuh
 delete mode 100644 cpp/src_prims/selection/columnWiseSort.cuh
 delete mode 100644 cpp/src_prims/selection/haversine_knn.cuh
 delete mode 100644 cpp/src_prims/selection/knn.cuh
 delete mode 100644 cpp/src_prims/selection/processing.cuh
 delete mode 100644 cpp/test/prims/adjusted_rand_index.cu
 delete mode 100644 cpp/test/prims/completeness_score.cu
 delete mode 100644 cpp/test/prims/contingencyMatrix.cu
 delete mode 100644 cpp/test/prims/dispersion.cu
 delete mode 100644 cpp/test/prims/entropy.cu
 delete mode 100644 cpp/test/prims/homogeneity_score.cu
 delete mode 100644 cpp/test/prims/kl_divergence.cu
 delete mode 100644 cpp/test/prims/mutual_info_score.cu
 delete mode 100644 cpp/test/prims/rand_index.cu
 delete mode 100644 cpp/test/prims/score.cu
 delete mode 100644 cpp/test/prims/silhouette_score.cu
 delete mode 100644 cpp/test/prims/trustworthiness.cu
 delete mode 100644 cpp/test/prims/v_measure.cu

diff --git a/cpp/src/metrics/accuracy_score.cu b/cpp/src/metrics/accuracy_score.cu
index 821cdd79e9..048d4f9047 100644
--- a/cpp/src/metrics/accuracy_score.cu
+++ b/cpp/src/metrics/accuracy_score.cu
@@ -16,7 +16,7 @@
  */
 
 #include <cuml/metrics/metrics.hpp>
-#include <metrics/scores.cuh>
+#include <raft/stats/accuracy_score.hpp>
 
 namespace ML {
 
@@ -27,7 +27,7 @@ float accuracy_score_py(const raft::handle_t& handle,
                         const int* ref_predictions,
                         int n)
 {
-  return MLCommon::Score::accuracy_score(predictions, ref_predictions, n, handle.get_stream());
+  return raft::stats::accuracy_score(predictions, ref_predictions, n, handle.get_stream());
 }
 }  // namespace Metrics
 }  // namespace ML
diff --git a/cpp/src/metrics/adjusted_rand_index.cu b/cpp/src/metrics/adjusted_rand_index.cu
index a3a55b3a0a..f2969663d8 100644
--- a/cpp/src/metrics/adjusted_rand_index.cu
+++ b/cpp/src/metrics/adjusted_rand_index.cu
@@ -16,7 +16,7 @@
  */
 
 #include <cuml/metrics/metrics.hpp>
-#include <metrics/adjusted_rand_index.cuh>
+#include <raft/stats/adjusted_rand_index.hpp>
 
 namespace ML {
 
@@ -26,7 +26,7 @@ double adjusted_rand_index(const raft::handle_t& handle,
                            const int64_t* y_hat,
                            const int64_t n)
 {
-  return MLCommon::Metrics::compute_adjusted_rand_index<int64_t, unsigned long long>(
+  return raft::stats::adjusted_rand_index<int64_t, unsigned long long>(
     y, y_hat, n, handle.get_stream());
 }
 
@@ -35,7 +35,7 @@ double adjusted_rand_index(const raft::handle_t& handle,
                            const int* y_hat,
                            const int n)
 {
-  return MLCommon::Metrics::compute_adjusted_rand_index<int, unsigned long long>(
+  return raft::stats::adjusted_rand_index<int, unsigned long long>(
     y, y_hat, n, handle.get_stream());
 }
 }  // namespace Metrics
diff --git a/cpp/src/metrics/completeness_score.cu b/cpp/src/metrics/completeness_score.cu
index b7b95a05e7..786be71387 100644
--- a/cpp/src/metrics/completeness_score.cu
+++ b/cpp/src/metrics/completeness_score.cu
@@ -16,7 +16,7 @@
  */
 
 #include <cuml/metrics/metrics.hpp>
-#include <metrics/homogeneity_score.cuh>
+#include <raft/stats/homogeneity_score.hpp>  // declares homogeneity_score, called below
 
 namespace ML {
 
@@ -29,7 +29,7 @@ double completeness_score(const raft::handle_t& handle,
                           const int lower_class_range,
                           const int upper_class_range)
 {
-  return MLCommon::Metrics::homogeneity_score(
+  return raft::stats::homogeneity_score(
     y_hat, y, n, lower_class_range, upper_class_range, handle.get_stream());
 }
 
diff --git a/cpp/src/metrics/entropy.cu b/cpp/src/metrics/entropy.cu
index 1935c427aa..a82ff1df9d 100644
--- a/cpp/src/metrics/entropy.cu
+++ b/cpp/src/metrics/entropy.cu
@@ -16,7 +16,7 @@
  */
 
 #include <cuml/metrics/metrics.hpp>
-#include <metrics/entropy.cuh>
+#include <raft/stats/entropy.hpp>
 
 namespace ML {
 
@@ -27,8 +27,7 @@ double entropy(const raft::handle_t& handle,
                const int lower_class_range,
                const int upper_class_range)
 {
-  return MLCommon::Metrics::entropy(
-    y, n, lower_class_range, upper_class_range, handle.get_stream());
+  return raft::stats::entropy(y, n, lower_class_range, upper_class_range, handle.get_stream());
 }
 }  // namespace Metrics
 }  // namespace ML
diff --git a/cpp/src/metrics/homogeneity_score.cu b/cpp/src/metrics/homogeneity_score.cu
index 3f2b231bf2..f902834821 100644
--- a/cpp/src/metrics/homogeneity_score.cu
+++ b/cpp/src/metrics/homogeneity_score.cu
@@ -16,7 +16,7 @@
  */
 
 #include <cuml/metrics/metrics.hpp>
-#include <metrics/homogeneity_score.cuh>
+#include <raft/stats/homogeneity_score.hpp>
 
 namespace ML {
 
@@ -29,7 +29,7 @@ double homogeneity_score(const raft::handle_t& handle,
                          const int lower_class_range,
                          const int upper_class_range)
 {
-  return MLCommon::Metrics::homogeneity_score(
+  return raft::stats::homogeneity_score(
     y, y_hat, n, lower_class_range, upper_class_range, handle.get_stream());
 }
 }  // namespace Metrics
diff --git a/cpp/src/metrics/kl_divergence.cu b/cpp/src/metrics/kl_divergence.cu
index f4c9ad6047..7e80f01c6a 100644
--- a/cpp/src/metrics/kl_divergence.cu
+++ b/cpp/src/metrics/kl_divergence.cu
@@ -16,7 +16,7 @@
  */
 
 #include <cuml/metrics/metrics.hpp>
-#include <metrics/kl_divergence.cuh>
+#include <raft/stats/kl_divergence.hpp>
 
 namespace ML {
 
@@ -24,12 +24,12 @@ namespace Metrics {
 
 double kl_divergence(const raft::handle_t& handle, const double* y, const double* y_hat, int n)
 {
-  return MLCommon::Metrics::kl_divergence(y, y_hat, n, handle.get_stream());
+  return raft::stats::kl_divergence(y, y_hat, n, handle.get_stream());
 }
 
 float kl_divergence(const raft::handle_t& handle, const float* y, const float* y_hat, int n)
 {
-  return MLCommon::Metrics::kl_divergence(y, y_hat, n, handle.get_stream());
+  return raft::stats::kl_divergence(y, y_hat, n, handle.get_stream());
 }
 }  // namespace Metrics
 }  // namespace ML
diff --git a/cpp/src/metrics/mutual_info_score.cu b/cpp/src/metrics/mutual_info_score.cu
index 1c2cf4c2a3..3b98654907 100644
--- a/cpp/src/metrics/mutual_info_score.cu
+++ b/cpp/src/metrics/mutual_info_score.cu
@@ -18,7 +18,7 @@
 #include <raft/handle.hpp>
 
 #include <cuml/metrics/metrics.hpp>
-#include <metrics/mutual_info_score.cuh>
+#include <raft/stats/mutual_info_score.hpp>
 
 namespace ML {
 
@@ -31,7 +31,7 @@ double mutual_info_score(const raft::handle_t& handle,
                          const int lower_class_range,
                          const int upper_class_range)
 {
-  return MLCommon::Metrics::mutual_info_score(
+  return raft::stats::mutual_info_score(
     y, y_hat, n, lower_class_range, upper_class_range, handle.get_stream());
 }
 
diff --git a/cpp/src/metrics/r2_score.cu b/cpp/src/metrics/r2_score.cu
index 402f8e8606..ce3f99fb02 100644
--- a/cpp/src/metrics/r2_score.cu
+++ b/cpp/src/metrics/r2_score.cu
@@ -15,7 +15,7 @@
  */
 
 #include <cuml/metrics/metrics.hpp>
-#include <metrics/scores.cuh>
+#include <raft/stats/r2_score.hpp>
 
 namespace ML {
 
@@ -23,12 +23,12 @@ namespace Metrics {
 
 float r2_score_py(const raft::handle_t& handle, float* y, float* y_hat, int n)
 {
-  return MLCommon::Score::r2_score(y, y_hat, n, handle.get_stream());
+  return raft::stats::r2_score(y, y_hat, n, handle.get_stream());
 }
 
 double r2_score_py(const raft::handle_t& handle, double* y, double* y_hat, int n)
 {
-  return MLCommon::Score::r2_score(y, y_hat, n, handle.get_stream());
+  return raft::stats::r2_score(y, y_hat, n, handle.get_stream());
 }
 
 }  // namespace Metrics
diff --git a/cpp/src/metrics/rand_index.cu b/cpp/src/metrics/rand_index.cu
index 021b0e1b28..8cc4af3ff8 100644
--- a/cpp/src/metrics/rand_index.cu
+++ b/cpp/src/metrics/rand_index.cu
@@ -18,7 +18,7 @@
 #include <raft/handle.hpp>
 
 #include <cuml/metrics/metrics.hpp>
-#include <metrics/rand_index.cuh>
+#include <raft/stats/rand_index.hpp>
 
 namespace ML {
 
@@ -26,7 +26,7 @@ namespace Metrics {
 
 double rand_index(const raft::handle_t& handle, const double* y, const double* y_hat, int n)
 {
-  return MLCommon::Metrics::compute_rand_index(y, y_hat, (uint64_t)n, handle.get_stream());
+  return raft::stats::rand_index(y, y_hat, (uint64_t)n, handle.get_stream());
 }
 }  // namespace Metrics
 }  // namespace ML
diff --git a/cpp/src/metrics/silhouette_score.cu b/cpp/src/metrics/silhouette_score.cu
index c80fe099f1..d9c4812274 100644
--- a/cpp/src/metrics/silhouette_score.cu
+++ b/cpp/src/metrics/silhouette_score.cu
@@ -16,9 +16,8 @@
  */
 
 #include <cuml/metrics/metrics.hpp>
-#include <metrics/batched/silhouette_score.cuh>
-#include <metrics/silhouette_score.cuh>
 #include <raft/distance/distance_type.hpp>
+#include <raft/stats/silhouette_score.hpp>
 
 namespace ML {
 
@@ -32,7 +31,7 @@ double silhouette_score(const raft::handle_t& handle,
                         double* silScores,
                         raft::distance::DistanceType metric)
 {
-  return MLCommon::Metrics::silhouette_score<double, int>(
+  return raft::stats::silhouette_score<double, int>(
     handle, y, nRows, nCols, labels, nLabels, silScores, handle.get_stream(), metric);
 }
 
@@ -48,7 +47,7 @@ float silhouette_score(const raft::handle_t& handle,
                        int chunk,
                        raft::distance::DistanceType metric)
 {
-  return MLCommon::Metrics::Batched::silhouette_score<float, int, int>(
+  return raft::stats::silhouette_score_batched<float, int, int>(
     handle, X, n_rows, n_cols, y, n_labels, scores, chunk, metric);
 }
 
@@ -62,7 +61,7 @@ double silhouette_score(const raft::handle_t& handle,
                         int chunk,
                         raft::distance::DistanceType metric)
 {
-  return MLCommon::Metrics::Batched::silhouette_score<double, int, int>(
+  return raft::stats::silhouette_score_batched<double, int, int>(
     handle, X, n_rows, n_cols, y, n_labels, scores, chunk, metric);
 }
 
diff --git a/cpp/src/metrics/trustworthiness.cu b/cpp/src/metrics/trustworthiness.cu
index e97776df47..c3c4a644df 100644
--- a/cpp/src/metrics/trustworthiness.cu
+++ b/cpp/src/metrics/trustworthiness.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <metrics/trustworthiness_score.cuh>
+#include <raft/stats/trustworthiness_score.hpp>
 
 #include <cuml/metrics/metrics.hpp>
 
@@ -50,7 +50,7 @@ double trustworthiness_score(const raft::handle_t& h,
                              int n_neighbors,
                              int batchSize)
 {
-  return MLCommon::Score::trustworthiness_score<math_t, distance_type>(
+  return raft::stats::trustworthiness_score<math_t, distance_type>(
     h, X, X_embedded, n, m, d, n_neighbors, batchSize);
 }
 
diff --git a/cpp/src/metrics/v_measure.cu b/cpp/src/metrics/v_measure.cu
index f71091543a..a979e988fc 100644
--- a/cpp/src/metrics/v_measure.cu
+++ b/cpp/src/metrics/v_measure.cu
@@ -16,7 +16,7 @@
  */
 
 #include <cuml/metrics/metrics.hpp>
-#include <metrics/v_measure.cuh>
+#include <raft/stats/v_measure.hpp>
 
 namespace ML {
 
@@ -29,7 +29,7 @@ double v_measure(const raft::handle_t& handle,
                  const int lower_class_range,
                  const int upper_class_range)
 {
-  return MLCommon::Metrics::v_measure(
+  return raft::stats::v_measure(
     y, y_hat, n, lower_class_range, upper_class_range, handle.get_stream());
 }
 }  // namespace Metrics
diff --git a/cpp/src_prims/metrics/adjusted_rand_index.cuh b/cpp/src_prims/metrics/adjusted_rand_index.cuh
deleted file mode 100644
index 3f0780e656..0000000000
--- a/cpp/src_prims/metrics/adjusted_rand_index.cuh
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * @file adjusted_rand_index.cuh
- * @brief The adjusted Rand index is the corrected-for-chance version of the Rand index.
- * Such a correction for chance establishes a baseline by using the expected similarity
- * of all pair-wise comparisons between clusterings specified by a random model.
- */
-
-#pragma once
-
-#include "contingencyMatrix.cuh"
-#include <cub/cub.cuh>
-#include <math.h>
-#include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
-#include <raft/linalg/map_then_reduce.hpp>
-#include <raft/linalg/reduce.hpp>
-#include <raft/stats/histogram.hpp>
-#include <rmm/device_scalar.hpp>
-#include <rmm/device_uvector.hpp>
-
-namespace MLCommon {
-namespace Metrics {
-
-/**
- * @brief Lambda to calculate the number of unordered pairs in a given input
- *
- * @tparam Type: Data type of the input
- * @param in: the input to the functional mapping
- * @param i: the indexing(not used in this case)
- */
-template <typename Type>
-struct nCTwo {
-  HDI Type operator()(Type in, int i = 0)
-  {
-    return in % 2 ? ((in - 1) >> 1) * in : (in >> 1) * (in - 1);
-  }
-};
-
-template <typename DataT, typename IdxT>
-struct Binner {
-  Binner(DataT minL) : minLabel(minL) {}
-
-  DI int operator()(DataT val, IdxT row, IdxT col) { return int(val - minLabel); }
-
- private:
-  DataT minLabel;
-};  // struct Binner
-
-/**
- * @brief Function to count the number of unique elements in the input array
- *
- * @tparam T data-type for input arrays
- *
- * @param[in]  arr       input array [on device] [len = size]
- * @param[in]  size      the size of the input array
- * @param[out] minLabel  the lower bound of the range of labels
- * @param[out] maxLabel  the upper bound of the range of labels
- * @param[in]  stream    cuda stream
- *
- * @return the number of unique elements in the array
- */
-template <typename T>
-int countUnique(const T* arr, int size, T& minLabel, T& maxLabel, cudaStream_t stream)
-{
-  auto ptr         = thrust::device_pointer_cast(arr);
-  auto minmax      = thrust::minmax_element(thrust::cuda::par.on(stream), ptr, ptr + size);
-  minLabel         = *minmax.first;
-  maxLabel         = *minmax.second;
-  auto totalLabels = int(maxLabel - minLabel + 1);
-  rmm::device_uvector<int> labelCounts(totalLabels, stream);
-  rmm::device_scalar<int> nUniq(stream);
-  raft::stats::histogram<T, int>(
-    raft::stats::HistTypeAuto,
-    labelCounts.data(),
-    totalLabels,
-    arr,
-    size,
-    1,
-    stream,
-    [minLabel] __device__(T val, int row, int col) { return int(val - minLabel); });
-  raft::linalg::mapThenSumReduce<int>(
-    nUniq.data(),
-    totalLabels,
-    [] __device__(const T& val) { return val != 0; },
-    stream,
-    labelCounts.data());
-  auto numUniques = nUniq.value(stream);
-  return numUniques;
-}
-
-/**
- * @brief Function to calculate Adjusted RandIndex as described
- *        <a href="https://en.wikipedia.org/wiki/Rand_index">here</a>
- * @tparam T data-type for input label arrays
- * @tparam MathT integral data-type used for computing n-choose-r
- * @param firstClusterArray: the array of classes
- * @param secondClusterArray: the array of classes
- * @param size: the size of the data points of type int
- * @param stream: the cudaStream object
- */
-template <typename T, typename MathT = int>
-double compute_adjusted_rand_index(const T* firstClusterArray,
-                                   const T* secondClusterArray,
-                                   int size,
-                                   cudaStream_t stream)
-{
-  ASSERT(size >= 2, "Rand Index for size less than 2 not defined!");
-  T minFirst, maxFirst, minSecond, maxSecond;
-  auto nUniqFirst      = countUnique(firstClusterArray, size, minFirst, maxFirst, stream);
-  auto nUniqSecond     = countUnique(secondClusterArray, size, minSecond, maxSecond, stream);
-  auto lowerLabelRange = std::min(minFirst, minSecond);
-  auto upperLabelRange = std::max(maxFirst, maxSecond);
-  auto nClasses        = upperLabelRange - lowerLabelRange + 1;
-  // degenerate case of single cluster or clusters each with just one element
-  if (nUniqFirst == nUniqSecond) {
-    if (nUniqFirst == 1 || nUniqFirst == size) return 1.0;
-  }
-  auto nUniqClasses = MathT(nClasses);
-  rmm::device_uvector<MathT> dContingencyMatrix(nUniqClasses * nUniqClasses, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(
-    dContingencyMatrix.data(), 0, nUniqClasses * nUniqClasses * sizeof(MathT), stream));
-  auto workspaceSz = getContingencyMatrixWorkspaceSize<T, MathT>(
-    size, firstClusterArray, stream, lowerLabelRange, upperLabelRange);
-  rmm::device_uvector<char> workspaceBuff(workspaceSz, stream);
-  contingencyMatrix<T, MathT>(firstClusterArray,
-                              secondClusterArray,
-                              size,
-                              dContingencyMatrix.data(),
-                              stream,
-                              workspaceBuff.data(),
-                              workspaceSz,
-                              lowerLabelRange,
-                              upperLabelRange);
-  rmm::device_uvector<MathT> a(nUniqClasses, stream);
-  rmm::device_uvector<MathT> b(nUniqClasses, stream);
-  rmm::device_scalar<MathT> d_aCTwoSum(stream);
-  rmm::device_scalar<MathT> d_bCTwoSum(stream);
-  rmm::device_scalar<MathT> d_nChooseTwoSum(stream);
-  MathT h_aCTwoSum, h_bCTwoSum, h_nChooseTwoSum;
-  RAFT_CUDA_TRY(cudaMemsetAsync(a.data(), 0, nUniqClasses * sizeof(MathT), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(b.data(), 0, nUniqClasses * sizeof(MathT), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_aCTwoSum.data(), 0, sizeof(MathT), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_bCTwoSum.data(), 0, sizeof(MathT), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_nChooseTwoSum.data(), 0, sizeof(MathT), stream));
-  // calculating the sum of NijC2
-  raft::linalg::mapThenSumReduce<MathT, nCTwo<MathT>>(d_nChooseTwoSum.data(),
-                                                      nUniqClasses * nUniqClasses,
-                                                      nCTwo<MathT>(),
-                                                      stream,
-                                                      dContingencyMatrix.data(),
-                                                      dContingencyMatrix.data());
-  // calculating the row-wise sums
-  raft::linalg::reduce<MathT, MathT>(
-    a.data(), dContingencyMatrix.data(), nUniqClasses, nUniqClasses, 0, true, true, stream);
-  // calculating the column-wise sums
-  raft::linalg::reduce<MathT, MathT>(
-    b.data(), dContingencyMatrix.data(), nUniqClasses, nUniqClasses, 0, true, false, stream);
-  // calculating the sum of number of unordered pairs for every element in a
-  raft::linalg::mapThenSumReduce<MathT, nCTwo<MathT>>(
-    d_aCTwoSum.data(), nUniqClasses, nCTwo<MathT>(), stream, a.data(), a.data());
-  // calculating the sum of number of unordered pairs for every element of b
-  raft::linalg::mapThenSumReduce<MathT, nCTwo<MathT>>(
-    d_bCTwoSum.data(), nUniqClasses, nCTwo<MathT>(), stream, b.data(), b.data());
-  // updating in the host memory
-  raft::update_host(&h_nChooseTwoSum, d_nChooseTwoSum.data(), 1, stream);
-  raft::update_host(&h_aCTwoSum, d_aCTwoSum.data(), 1, stream);
-  raft::update_host(&h_bCTwoSum, d_bCTwoSum.data(), 1, stream);
-  // calculating the ARI
-  auto nChooseTwo    = double(size) * double(size - 1) / 2.0;
-  auto expectedIndex = double(h_aCTwoSum) * double(h_bCTwoSum) / double(nChooseTwo);
-  auto maxIndex      = (double(h_bCTwoSum) + double(h_aCTwoSum)) / 2.0;
-  auto index         = double(h_nChooseTwoSum);
-  if (maxIndex - expectedIndex)
-    return (index - expectedIndex) / (maxIndex - expectedIndex);
-  else
-    return 0;
-}
-
-};  // end namespace Metrics
-};  // end namespace MLCommon
diff --git a/cpp/src_prims/metrics/batched/information_criterion.cuh b/cpp/src_prims/metrics/batched/information_criterion.cuh
deleted file mode 100644
index 8770246f07..0000000000
--- a/cpp/src_prims/metrics/batched/information_criterion.cuh
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file information_criterion.cuh
- * @brief These information criteria are used to evaluate the quality of models
- *        by balancing the quality of the fit and the number of parameters.
- *
- * See:
- *  - AIC: https://en.wikipedia.org/wiki/Akaike_information_criterion
- *  - AICc: https://en.wikipedia.org/wiki/Akaike_information_criterion#AICc
- *  - BIC: https://en.wikipedia.org/wiki/Bayesian_information_criterion
- */
-
-#include <raft/linalg/unary_op.hpp>
-
-#include <cmath>
-
-namespace MLCommon {
-namespace Metrics {
-
-/// Supported types of information criteria
-enum IC_Type { AIC, AICc, BIC };
-
-namespace Batched {
-
-/**
- * Compute the given type of information criterion
- *
- * @note: it is safe to do the computation in-place (i.e give same pointer
- *        as input and output)
- *
- * @param[out] d_ic             Information criterion to be returned for each
- *                              series (device)
- * @param[in]  d_loglikelihood  Log-likelihood for each series (device)
- * @param[in]  ic_type          Type of criterion to compute. See IC_Type
- * @param[in]  n_params         Number of parameters in the model
- * @param[in]  batch_size       Number of series in the batch
- * @param[in]  n_samples        Number of samples in each series
- * @param[in]  stream           CUDA stream
- */
-template <typename ScalarT, typename IdxT>
-void information_criterion(ScalarT* d_ic,
-                           const ScalarT* d_loglikelihood,
-                           IC_Type ic_type,
-                           IdxT n_params,
-                           IdxT batch_size,
-                           IdxT n_samples,
-                           cudaStream_t stream)
-{
-  ScalarT ic_base{};
-  ScalarT N = static_cast<ScalarT>(n_params);
-  ScalarT T = static_cast<ScalarT>(n_samples);
-  switch (ic_type) {
-    case AIC: ic_base = (ScalarT)2.0 * N; break;
-    case AICc:
-      ic_base = (ScalarT)2.0 * (N + (N * (N + (ScalarT)1.0)) / (T - N - (ScalarT)1.0));
-      break;
-    case BIC: ic_base = std::log(T) * N; break;
-  }
-  /* Compute information criterion from log-likelihood and base term */
-  raft::linalg::unaryOp(
-    d_ic,
-    d_loglikelihood,
-    batch_size,
-    [=] __device__(ScalarT loglike) { return ic_base - (ScalarT)2.0 * loglike; },
-    stream);
-}
-
-}  // namespace Batched
-}  // namespace Metrics
-}  // namespace MLCommon
diff --git a/cpp/src_prims/metrics/batched/silhouette_score.cuh b/cpp/src_prims/metrics/batched/silhouette_score.cuh
deleted file mode 100644
index 813393ce6b..0000000000
--- a/cpp/src_prims/metrics/batched/silhouette_score.cuh
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "../silhouette_score.cuh"
-#include <cuml/metrics/metrics.hpp>
-
-#include <raft/cuda_utils.cuh>
-#include <raft/device_atomics.cuh>
-#include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-#include <thrust/device_vector.h>
-
-namespace MLCommon {
-namespace Metrics {
-namespace Batched {
-
-namespace detail {
-
-/**
- * This kernel initializes matrix b (n_rows * n_labels)
- * For each label that the corresponding row is not a part of is initialized as 0
- * If the corresponding row is the only sample in its label, again 0
- * Only if the there are > 1 samples in the label, row is initialized to max
- */
-template <typename value_t, typename value_idx, typename label_idx>
-__global__ void fill_b_kernel(value_t* b,
-                              const label_idx* y,
-                              value_idx n_rows,
-                              label_idx n_labels,
-                              const value_idx* cluster_counts)
-{
-  value_idx idx = threadIdx.x + blockIdx.x * blockDim.x;
-  label_idx idy = threadIdx.y + blockIdx.y * blockDim.y;
-
-  if (idx >= n_rows || idy >= n_labels) { return; }
-
-  auto row_cluster = y[idx];
-
-  auto col_cluster_count = cluster_counts[idy];
-
-  // b for own cluster should be max value
-  // so that it does not interfere with min operator
-  // b is also max if col cluster count is 0
-  // however, b is 0 if self cluster count is 1
-  if (row_cluster == idy || col_cluster_count == 0) {
-    if (cluster_counts[row_cluster] == 1) {
-      b[idx * n_labels + idy] = 0;
-    } else {
-      b[idx * n_labels + idy] = std::numeric_limits<value_t>::max();
-    }
-  } else {
-    b[idx * n_labels + idy] = 0;
-  }
-}
-
-/**
- * This kernel does an elementwise sweep of chunked pairwise distance matrix
- * By knowing the offsets of the chunked pairwise distance matrix in the
- * global pairwise distance matrix, we are able to calculate
- * intermediate values of a and b for the rows and columns present in the
- * current chunked pairwise distance matrix.
- */
-template <typename value_t, typename value_idx, typename label_idx>
-__global__ void compute_chunked_a_b_kernel(value_t* a,
-                                           value_t* b,
-                                           value_idx row_offset,
-                                           value_idx col_offset,
-                                           const label_idx* y,
-                                           label_idx n_labels,
-                                           const value_idx* cluster_counts,
-                                           const value_t* distances,
-                                           value_idx dist_rows,
-                                           value_idx dist_cols)
-{
-  value_idx row_id = threadIdx.x + blockIdx.x * blockDim.x;
-  value_idx col_id = threadIdx.y + blockIdx.y * blockDim.y;
-
-  // these are global offsets of current element
-  // in the full pairwise distance matrix
-  value_idx pw_row_id = row_id + row_offset;
-  value_idx pw_col_id = col_id + col_offset;
-
-  if (row_id >= dist_rows || col_id >= dist_cols || pw_row_id == pw_col_id) { return; }
-
-  auto row_cluster = y[pw_row_id];
-  if (cluster_counts[row_cluster] == 1) { return; }
-
-  auto col_cluster        = y[pw_col_id];
-  auto col_cluster_counts = cluster_counts[col_cluster];
-
-  if (col_cluster == row_cluster) {
-    atomicAdd(&a[pw_row_id], distances[row_id * dist_cols + col_id] / (col_cluster_counts - 1));
-  } else {
-    atomicAdd(&b[pw_row_id * n_labels + col_cluster],
-              distances[row_id * dist_cols + col_id] / col_cluster_counts);
-  }
-}
-
-}  // namespace detail
-
-template <typename value_idx, typename label_idx>
-rmm::device_uvector<value_idx> get_cluster_counts(const raft::handle_t& handle,
-                                                  label_idx* y,
-                                                  value_idx& n_rows,
-                                                  label_idx& n_labels)
-{
-  auto stream = handle.get_stream();
-
-  rmm::device_uvector<value_idx> cluster_counts(n_labels, stream);
-
-  rmm::device_uvector<char> workspace(1, stream);
-
-  MLCommon::Metrics::countLabels(y, cluster_counts.data(), n_rows, n_labels, workspace, stream);
-
-  return cluster_counts;
-}
-
-template <typename value_t, typename value_idx>
-rmm::device_uvector<value_t> get_pairwise_distance(const raft::handle_t& handle,
-                                                   value_t* left_begin,
-                                                   value_t* right_begin,
-                                                   value_idx& n_left_rows,
-                                                   value_idx& n_right_rows,
-                                                   value_idx& n_cols,
-                                                   raft::distance::DistanceType metric,
-                                                   cudaStream_t stream)
-{
-  rmm::device_uvector<value_t> distances(n_left_rows * n_right_rows, stream);
-
-  ML::Metrics::pairwise_distance(
-    handle, left_begin, right_begin, distances.data(), n_left_rows, n_right_rows, n_cols, metric);
-
-  return distances;
-}
-
-template <typename value_t, typename value_idx, typename label_idx>
-void compute_chunked_a_b(const raft::handle_t& handle,
-                         value_t* a,
-                         value_t* b,
-                         value_idx& row_offset,
-                         value_idx& col_offset,
-                         const label_idx* y,
-                         label_idx& n_labels,
-                         const value_idx* cluster_counts,
-                         const value_t* distances,
-                         value_idx& dist_rows,
-                         value_idx& dist_cols,
-                         cudaStream_t stream)
-{
-  dim3 block_size(std::min(dist_rows, 32), std::min(dist_cols, 32));
-  dim3 grid_size(raft::ceildiv(dist_rows, (value_idx)block_size.x),
-                 raft::ceildiv(dist_cols, (value_idx)block_size.y));
-
-  detail::compute_chunked_a_b_kernel<<<grid_size, block_size, 0, stream>>>(
-    a, b, row_offset, col_offset, y, n_labels, cluster_counts, distances, dist_rows, dist_cols);
-}
-
-template <typename value_t, typename value_idx, typename label_idx>
-value_t silhouette_score(
-  const raft::handle_t& handle,
-  value_t* X,
-  value_idx n_rows,
-  value_idx n_cols,
-  label_idx* y,
-  label_idx n_labels,
-  value_t* scores,
-  value_idx chunk,
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded)
-{
-  ASSERT(n_labels >= 2 && n_labels <= (n_rows - 1),
-         "silhouette Score not defined for the given number of labels!");
-
-  rmm::device_uvector<value_idx> cluster_counts = get_cluster_counts(handle, y, n_rows, n_labels);
-
-  auto stream = handle.get_stream();
-  auto policy = handle.get_thrust_policy();
-
-  auto b_size = n_rows * n_labels;
-
-  value_t *a_ptr, *b_ptr;
-  rmm::device_uvector<value_t> a(0, stream);
-  rmm::device_uvector<value_t> b(b_size, stream);
-
-  b_ptr = b.data();
-
-  // since a and silhouette score per sample are same size, reusing
-  if (scores == nullptr || scores == NULL) {
-    a.resize(n_rows, stream);
-    a_ptr = a.data();
-  } else {
-    a_ptr = scores;
-  }
-
-  thrust::fill(policy, a_ptr, a_ptr + n_rows, 0);
-
-  dim3 block_size(std::min(n_rows, 32), std::min(n_labels, 32));
-  dim3 grid_size(raft::ceildiv(n_rows, (value_idx)block_size.x),
-                 raft::ceildiv(n_labels, (label_idx)block_size.y));
-  detail::fill_b_kernel<<<grid_size, block_size, 0, stream>>>(
-    b_ptr, y, n_rows, n_labels, cluster_counts.data());
-
-  handle.wait_stream_pool_on_stream();
-
-  auto n_iters = 0;
-
-  for (value_idx i = 0; i < n_rows; i += chunk) {
-    for (value_idx j = 0; j < n_rows; j += chunk) {
-      ++n_iters;
-
-      auto chunk_stream = handle.get_next_usable_stream(i + chunk * j);
-
-      auto* left_begin  = X + (i * n_cols);
-      auto* right_begin = X + (j * n_cols);
-
-      auto n_left_rows  = (i + chunk) < n_rows ? chunk : (n_rows - i);
-      auto n_right_rows = (j + chunk) < n_rows ? chunk : (n_rows - j);
-
-      rmm::device_uvector<value_t> distances = get_pairwise_distance(
-        handle, left_begin, right_begin, n_left_rows, n_right_rows, n_cols, metric, chunk_stream);
-
-      compute_chunked_a_b(handle,
-                          a_ptr,
-                          b_ptr,
-                          i,
-                          j,
-                          y,
-                          n_labels,
-                          cluster_counts.data(),
-                          distances.data(),
-                          n_left_rows,
-                          n_right_rows,
-                          chunk_stream);
-    }
-  }
-
-  handle.sync_stream_pool();
-
-  // calculating row-wise minimum in b
-  // this prim only supports int indices for now
-  raft::linalg::
-    reduce<value_t, value_t, value_idx, raft::Nop<value_t>, MLCommon::Metrics::MinOp<value_t>>(
-      b_ptr,
-      b_ptr,
-      n_labels,
-      n_rows,
-      std::numeric_limits<value_t>::max(),
-      true,
-      true,
-      stream,
-      false,
-      raft::Nop<value_t>(),
-      MLCommon::Metrics::MinOp<value_t>());
-
-  // calculating the silhouette score per sample
-  raft::linalg::binaryOp<value_t, MLCommon::Metrics::SilOp<value_t>, value_t, value_idx>(
-    a_ptr, a_ptr, b_ptr, n_rows, MLCommon::Metrics::SilOp<value_t>(), stream);
-
-  return thrust::reduce(policy, a_ptr, a_ptr + n_rows, value_t(0)) / n_rows;
-}
-
-}  // namespace Batched
-}  // namespace Metrics
-}  // namespace MLCommon
diff --git a/cpp/src_prims/metrics/completeness_score.cuh b/cpp/src_prims/metrics/completeness_score.cuh
deleted file mode 100644
index d5805edc64..0000000000
--- a/cpp/src_prims/metrics/completeness_score.cuh
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * @file completeness_score.cuh
- *
- * @brief A clustering result satisfies completeness if all the data points
- * that are members of a given class are elements of the same cluster.
- */
-
-#pragma once
-
-#include "entropy.cuh"
-#include "mutual_info_score.cuh"
-
-namespace MLCommon {
-namespace Metrics {
-
-/**
- * @brief Function to calculate the completeness score between two clusters
- *
- * @param truthClusterArray: the array of truth classes of type T
- * @param predClusterArray: the array of predicted classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
- */
-template <typename T>
-double completeness_score(const T* truthClusterArray,
-                          const T* predClusterArray,
-                          int size,
-                          T lowerLabelRange,
-                          T upperLabelRange,
-                          cudaStream_t stream)
-{
-  if (size == 0) return 1.0;
-
-  double computedMI, computedEntropy;
-
-  computedMI = MLCommon::Metrics::mutual_info_score(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-  computedEntropy =
-    MLCommon::Metrics::entropy(predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-
-  double completeness;
-
-  if (computedEntropy) {
-    completeness = computedMI / computedEntropy;
-  } else
-    completeness = 1.0;
-
-  return completeness;
-}
-
-};  // end namespace Metrics
-};  // end namespace MLCommon
diff --git a/cpp/src_prims/metrics/contingencyMatrix.cuh b/cpp/src_prims/metrics/contingencyMatrix.cuh
deleted file mode 100644
index 9fc0526565..0000000000
--- a/cpp/src_prims/metrics/contingencyMatrix.cuh
+++ /dev/null
@@ -1,312 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
-
-#include <thrust/device_ptr.h>
-#include <thrust/reduce.h>
-
-#include <cub/cub.cuh>
-
-#include <math.h>
-
-namespace MLCommon {
-namespace Metrics {
-
-typedef enum {
-  IMPL_NONE,
-  SMEM_ATOMICS,
-  GLOBAL_ATOMICS,
-  SORT_AND_GATOMICS
-} ContingencyMatrixImplType;
-
-template <typename T, typename OutT = int>
-__global__ void devConstructContingencyMatrix(const T* groundTruth,
-                                              const T* predicted,
-                                              int nSamples,
-                                              OutT* outMat,
-                                              int outIdxOffset,
-                                              int outMatWidth)
-{
-  int elementId = threadIdx.x + blockDim.x * blockIdx.x;
-  if (elementId < nSamples) {
-    T gt           = groundTruth[elementId];
-    T pd           = predicted[elementId];
-    auto outputIdx = (gt - outIdxOffset) * outMatWidth + pd - outIdxOffset;
-    raft::myAtomicAdd(outMat + outputIdx, OutT(1));
-  }
-}
-
-template <typename T, typename OutT = int>
-void computeCMatWAtomics(const T* groundTruth,
-                         const T* predictedLabel,
-                         int nSamples,
-                         OutT* outMat,
-                         int outIdxOffset,
-                         int outDimN,
-                         cudaStream_t stream)
-{
-  RAFT_CUDA_TRY(
-    cudaFuncSetCacheConfig(devConstructContingencyMatrix<T, OutT>, cudaFuncCachePreferL1));
-  static const int block = 128;
-  auto grid              = raft::ceildiv(nSamples, block);
-  devConstructContingencyMatrix<T, OutT><<<grid, block, 0, stream>>>(
-    groundTruth, predictedLabel, nSamples, outMat, outIdxOffset, outDimN);
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename T, typename OutT = int>
-__global__ void devConstructContingencyMatrixSmem(const T* groundTruth,
-                                                  const T* predicted,
-                                                  int nSamples,
-                                                  OutT* outMat,
-                                                  int outIdxOffset,
-                                                  int outMatWidth)
-{
-  extern __shared__ char smem[];
-  auto* sMemMatrix = reinterpret_cast<OutT*>(smem);
-  for (int smemIdx = threadIdx.x; smemIdx < outMatWidth * outMatWidth; smemIdx += blockDim.x) {
-    sMemMatrix[smemIdx] = 0;
-  }
-  __syncthreads();
-  int elementId = threadIdx.x + blockDim.x * blockIdx.x;
-  if (elementId < nSamples) {
-    T gt           = groundTruth[elementId];
-    T pd           = predicted[elementId];
-    auto outputIdx = (gt - outIdxOffset) * outMatWidth + pd - outIdxOffset;
-    raft::myAtomicAdd(sMemMatrix + outputIdx, OutT(1));
-  }
-  __syncthreads();
-  for (int smemIdx = threadIdx.x; smemIdx < outMatWidth * outMatWidth; smemIdx += blockDim.x) {
-    raft::myAtomicAdd(outMat + smemIdx, sMemMatrix[smemIdx]);
-  }
-}
-
-template <typename T, typename OutT = int>
-void computeCMatWSmemAtomics(const T* groundTruth,
-                             const T* predictedLabel,
-                             int nSamples,
-                             OutT* outMat,
-                             int outIdxOffset,
-                             int outDimN,
-                             cudaStream_t stream)
-{
-  static const int block  = 128;
-  auto grid               = raft::ceildiv(nSamples, block);
-  size_t smemSizePerBlock = outDimN * outDimN * sizeof(OutT);
-  devConstructContingencyMatrixSmem<T, OutT><<<grid, block, smemSizePerBlock, stream>>>(
-    groundTruth, predictedLabel, nSamples, outMat, outIdxOffset, outDimN);
-  RAFT_CUDA_TRY(cudaGetLastError());
-}
-
-template <typename T, typename OutT = int>
-void contingencyMatrixWSort(const T* groundTruth,
-                            const T* predictedLabel,
-                            int nSamples,
-                            OutT* outMat,
-                            T minLabel,
-                            T maxLabel,
-                            void* workspace,
-                            size_t workspaceSize,
-                            cudaStream_t stream)
-{
-  T* outKeys           = reinterpret_cast<T*>(workspace);
-  auto alignedBufferSz = raft::alignTo<size_t>(nSamples * sizeof(T), 256);
-  T* outValue          = reinterpret_cast<T*>((size_t)workspace + alignedBufferSz);
-  void* pWorkspaceCub  = reinterpret_cast<void*>((size_t)workspace + 2 * alignedBufferSz);
-  auto bitsToSort      = log2<int>(maxLabel);
-  if (!raft::isPo2(maxLabel)) ++bitsToSort;
-  // we dont really need perfect sorting, should get by with some sort of
-  // binning-reordering operation
-  ///@todo: future work - explore "efficient" custom binning kernels vs cub sort
-  RAFT_CUDA_TRY(cub::DeviceRadixSort::SortPairs(pWorkspaceCub,
-                                                workspaceSize,
-                                                groundTruth,
-                                                outKeys,
-                                                predictedLabel,
-                                                outValue,
-                                                nSamples,
-                                                0,
-                                                bitsToSort,
-                                                stream));
-  auto outDimM_N = int(maxLabel - minLabel + 1);
-  computeCMatWAtomics<T, OutT>(outKeys, outValue, nSamples, outMat, minLabel, outDimM_N, stream);
-}
-
-template <typename OutT = int>
-ContingencyMatrixImplType getImplVersion(OutT outDimN)
-{
-  int currDevice  = 0;
-  int l2CacheSize = 0;
-  // no way to query this from CUDA APIs, value for CC 7.0, 3.0
-  int maxBlocksResidentPerSM = 16;
-  RAFT_CUDA_TRY(cudaGetDevice(&currDevice));
-  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&l2CacheSize, cudaDevAttrL2CacheSize, currDevice));
-  auto maxSmemPerBlock                  = raft::getSharedMemPerBlock();
-  ContingencyMatrixImplType implVersion = IMPL_NONE;
-  // keeping 8 block per SM to get good utilization
-  // can go higher but reduced L1 size degrades perf
-  OutT upperLimitSmemAtomics =
-    std::floor(std::sqrt(maxSmemPerBlock / (sizeof(OutT) * (maxBlocksResidentPerSM / 2))));
-  OutT upperLimitL2Atomics = std::floor(std::sqrt(l2CacheSize / sizeof(OutT)));
-  if (outDimN <= upperLimitSmemAtomics)
-    implVersion = SMEM_ATOMICS;
-  else if (outDimN <= upperLimitL2Atomics)
-    implVersion = GLOBAL_ATOMICS;
-  else
-    implVersion = SORT_AND_GATOMICS;
-  return implVersion;
-}
-
-/**
- * @brief use this to allocate output matrix size
- * size of matrix = (maxLabel - minLabel + 1)^2 * sizeof(int)
- * @param groundTruth: device 1-d array for ground truth (num of rows)
- * @param nSamples: number of elements in input array
- * @param stream: cuda stream for execution
- * @param minLabel: [out] calculated min value in input array
- * @param maxLabel: [out] calculated max value in input array
- */
-template <typename T>
-void getInputClassCardinality(
-  const T* groundTruth, const int nSamples, cudaStream_t stream, T& minLabel, T& maxLabel)
-{
-  thrust::device_ptr<const T> dTrueLabel = thrust::device_pointer_cast(groundTruth);
-  auto min_max =
-    thrust::minmax_element(thrust::cuda::par.on(stream), dTrueLabel, dTrueLabel + nSamples);
-  minLabel = *min_max.first;
-  maxLabel = *min_max.second;
-}
-
-/**
- * @brief Calculate workspace size for running contingency matrix calculations
- * @tparam T label type
- * @tparam OutT output matrix type
- * @param nSamples: number of elements in input array
- * @param groundTruth: device 1-d array for ground truth (num of rows)
- * @param stream: cuda stream for execution
- * @param minLabel: Optional, min value in input array
- * @param maxLabel: Optional, max value in input array
- */
-template <typename T, typename OutT = int>
-size_t getContingencyMatrixWorkspaceSize(int nSamples,
-                                         const T* groundTruth,
-                                         cudaStream_t stream,
-                                         T minLabel = std::numeric_limits<T>::max(),
-                                         T maxLabel = std::numeric_limits<T>::max())
-{
-  size_t workspaceSize = 0;
-  // below is a redundant computation - can be avoided
-  if (minLabel == std::numeric_limits<T>::max() || maxLabel == std::numeric_limits<T>::max()) {
-    getInputClassCardinality<T>(groundTruth, nSamples, stream, minLabel, maxLabel);
-  }
-  auto outDimN                          = OutT(maxLabel - minLabel + 1);
-  ContingencyMatrixImplType implVersion = getImplVersion<OutT>(outDimN);
-  if (implVersion == SORT_AND_GATOMICS) {
-    void* pWorkspaceCub{};
-    size_t tmpStorageBytes = 0;
-    // no-op pointers to get workspace size
-    T* pTmpUnused{};
-    RAFT_CUDA_TRY(cub::DeviceRadixSort::SortPairs(
-      pWorkspaceCub, tmpStorageBytes, pTmpUnused, pTmpUnused, pTmpUnused, pTmpUnused, nSamples));
-    auto tmpStagingMemorySize = raft::alignTo<size_t>(nSamples * sizeof(T), 256);
-    tmpStagingMemorySize *= 2;
-    workspaceSize = tmpStagingMemorySize + tmpStorageBytes;
-  }
-  return workspaceSize;
-}
-
-/**
- * @brief contruct contingency matrix given input ground truth and prediction
- *        labels. Users should call function getInputClassCardinality to find
- *        and allocate memory for output. Similarly workspace requirements
- *        should be checked using function getContingencyMatrixWorkspaceSize
- * @tparam T label type
- * @tparam OutT output matrix type
- * @param groundTruth: device 1-d array for ground truth (num of rows)
- * @param predictedLabel: device 1-d array for prediction (num of columns)
- * @param nSamples: number of elements in input array
- * @param outMat: output buffer for contingecy matrix
- * @param stream: cuda stream for execution
- * @param workspace: Optional, workspace memory allocation
- * @param workspaceSize: Optional, size of workspace memory
- * @param minLabel: Optional, min value in input ground truth array
- * @param maxLabel: Optional, max value in input ground truth array
- */
-template <typename T, typename OutT = int>
-void contingencyMatrix(const T* groundTruth,
-                       const T* predictedLabel,
-                       int nSamples,
-                       OutT* outMat,
-                       cudaStream_t stream,
-                       void* workspace      = nullptr,
-                       size_t workspaceSize = 0,
-                       T minLabel           = std::numeric_limits<T>::max(),
-                       T maxLabel           = std::numeric_limits<T>::max())
-{
-  // assumptions:
-  // output is not at par with scikit learn - output will be square matrix
-  // always with numRows = numColumns = numOfClassesInTrueLabel
-  // it is also assumed that true labels are monotically increasing
-  // if for some reason groundTruth completely skips some labels
-  // eg: {0,1,2,5} instead of {0,1,2,3}.
-  // Output matrix will still have empty rows for label value {3,4}
-  // Users can use "make_monotonic" to convert their discontinuous input label
-  // range to a monotonically increasing one  //
-  // this also serves as way to measure co-occurence/joint counts for NLP tasks which
-  // can be used to then compute pointwise mutual information and mutual information
-  if (minLabel == std::numeric_limits<T>::max() || maxLabel == std::numeric_limits<T>::max()) {
-    getInputClassCardinality<T>(groundTruth, nSamples, stream, minLabel, maxLabel);
-  }
-  auto outDimM_N = OutT(maxLabel - minLabel + 1);
-  RAFT_CUDA_TRY(cudaMemsetAsync(outMat, 0, sizeof(OutT) * outDimM_N * outDimM_N, stream));
-  ContingencyMatrixImplType implVersion = getImplVersion<OutT>(outDimM_N);
-  switch (implVersion) {
-    case SMEM_ATOMICS:
-      // smem atomics and then single global mem atomics only works
-      // when all label count can fit in smem for a block
-      // helps when GLOBAL_ATOMICS performance blocked by atomic update
-      // serialization -when very less labels ~10 labels
-      computeCMatWSmemAtomics<T, OutT>(
-        groundTruth, predictedLabel, nSamples, outMat, minLabel, outDimM_N, stream);
-      break;
-    case GLOBAL_ATOMICS:
-      // launch kernel - global atomic ops per (groundTruth,predictedValue) pair
-      computeCMatWAtomics<T, OutT>(
-        groundTruth, predictedLabel, nSamples, outMat, minLabel, outDimM_N, stream);
-      break;
-      // more L2 thrashing if atomic OPs land in completely different mem
-      // segment - when more labels
-    case SORT_AND_GATOMICS:
-      contingencyMatrixWSort<T, OutT>(groundTruth,
-                                      predictedLabel,
-                                      nSamples,
-                                      outMat,
-                                      minLabel,
-                                      maxLabel,
-                                      workspace,
-                                      workspaceSize,
-                                      stream);
-      break;
-    case IMPL_NONE: break;
-  }
-}
-
-};  // namespace Metrics
-};  // namespace MLCommon
diff --git a/cpp/src_prims/metrics/dispersion.cuh b/cpp/src_prims/metrics/dispersion.cuh
deleted file mode 100644
index b2d3c007fb..0000000000
--- a/cpp/src_prims/metrics/dispersion.cuh
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cub/cub.cuh>
-#include <memory>
-#include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
-#include <raft/interruptible.hpp>
-#include <raft/linalg/eltwise.hpp>
-#include <rmm/device_uvector.hpp>
-
-namespace MLCommon {
-namespace Metrics {
-
-///@todo: ColsPerBlk has been tested only for 32!
-template <typename DataT, typename IdxT, int TPB, int ColsPerBlk = 32>
-__global__ void weightedMeanKernel(DataT* mu, const DataT* data, const IdxT* counts, IdxT D, IdxT N)
-{
-  constexpr int RowsPerBlkPerIter = TPB / ColsPerBlk;
-  IdxT thisColId                  = threadIdx.x % ColsPerBlk;
-  IdxT thisRowId                  = threadIdx.x / ColsPerBlk;
-  IdxT colId                      = thisColId + ((IdxT)blockIdx.y * ColsPerBlk);
-  IdxT rowId                      = thisRowId + ((IdxT)blockIdx.x * RowsPerBlkPerIter);
-  DataT thread_data               = DataT(0);
-  const IdxT stride               = RowsPerBlkPerIter * gridDim.x;
-  __shared__ DataT smu[ColsPerBlk];
-  if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = DataT(0);
-  for (IdxT i = rowId; i < N; i += stride) {
-    thread_data += (colId < D) ? data[i * D + colId] * (DataT)counts[i] : DataT(0);
-  }
-  __syncthreads();
-  raft::myAtomicAdd(smu + thisColId, thread_data);
-  __syncthreads();
-  if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]);
-}
-
-template <typename DataT, typename IdxT, int TPB>
-__global__ void dispersionKernel(DataT* result,
-                                 const DataT* clusters,
-                                 const IdxT* clusterSizes,
-                                 const DataT* mu,
-                                 IdxT dim,
-                                 IdxT nClusters)
-{
-  IdxT tid    = threadIdx.x + blockIdx.x * blockDim.x;
-  IdxT len    = dim * nClusters;
-  IdxT stride = blockDim.x * gridDim.x;
-  DataT sum   = DataT(0);
-  for (; tid < len; tid += stride) {
-    IdxT col   = tid % dim;
-    IdxT row   = tid / dim;
-    DataT diff = clusters[tid] - mu[col];
-    sum += diff * diff * DataT(clusterSizes[row]);
-  }
-  typedef cub::BlockReduce<DataT, TPB> BlockReduce;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  __syncthreads();
-  auto acc = BlockReduce(temp_storage).Sum(sum);
-  __syncthreads();
-  if (threadIdx.x == 0) raft::myAtomicAdd(result, acc);
-}
-
-/**
- * @brief Compute cluster dispersion metric. This is very useful for
- * automatically finding the 'k' (in kmeans) that improves this metric.
- * @tparam DataT data type
- * @tparam IdxT index type
- * @tparam TPB threads block for kernels launched
- * @param centroids the cluster centroids. This is assumed to be row-major
- *   and of dimension (nClusters x dim)
- * @param clusterSizes number of points in the dataset which belong to each
- *   cluster. This is of length nClusters
- * @param globalCentroid compute the global weighted centroid of all cluster
- *   centroids. This is of length dim. Pass a nullptr if this is not needed
- * @param nClusters number of clusters
- * @param nPoints number of points in the dataset
- * @param dim dataset dimensionality
- * @param stream cuda stream
- * @return the cluster dispersion value
- */
-template <typename DataT, typename IdxT = int, int TPB = 256>
-DataT dispersion(const DataT* centroids,
-                 const IdxT* clusterSizes,
-                 DataT* globalCentroid,
-                 IdxT nClusters,
-                 IdxT nPoints,
-                 IdxT dim,
-                 cudaStream_t stream)
-{
-  static const int RowsPerThread = 4;
-  static const int ColsPerBlk    = 32;
-  static const int RowsPerBlk    = (TPB / ColsPerBlk) * RowsPerThread;
-  dim3 grid(raft::ceildiv(nPoints, (IdxT)RowsPerBlk), raft::ceildiv(dim, (IdxT)ColsPerBlk));
-  rmm::device_uvector<DataT> mean(0, stream);
-  rmm::device_uvector<DataT> result(1, stream);
-  DataT* mu = globalCentroid;
-  if (globalCentroid == nullptr) {
-    mean.resize(dim, stream);
-    mu = mean.data();
-  }
-  RAFT_CUDA_TRY(cudaMemsetAsync(mu, 0, sizeof(DataT) * dim, stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(result.data(), 0, sizeof(DataT), stream));
-  weightedMeanKernel<DataT, IdxT, TPB, ColsPerBlk>
-    <<<grid, TPB, 0, stream>>>(mu, centroids, clusterSizes, dim, nClusters);
-  RAFT_CUDA_TRY(cudaGetLastError());
-  DataT ratio = DataT(1) / DataT(nPoints);
-  raft::linalg::scalarMultiply(mu, mu, ratio, dim, stream);
-  // finally, compute the dispersion
-  constexpr int ItemsPerThread = 4;
-  int nblks                    = raft::ceildiv<int>(dim * nClusters, TPB * ItemsPerThread);
-  dispersionKernel<DataT, IdxT, TPB>
-    <<<nblks, TPB, 0, stream>>>(result.data(), centroids, clusterSizes, mu, dim, nClusters);
-  RAFT_CUDA_TRY(cudaGetLastError());
-  DataT h_result;
-  raft::update_host(&h_result, result.data(), 1, stream);
-  raft::interruptible::synchronize(stream);
-  return sqrt(h_result);
-}
-
-}  // end namespace Metrics
-}  // end namespace MLCommon
diff --git a/cpp/src_prims/metrics/entropy.cuh b/cpp/src_prims/metrics/entropy.cuh
deleted file mode 100644
index 55650a3345..0000000000
--- a/cpp/src_prims/metrics/entropy.cuh
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * @file entropy.cuh
- * @brief Calculates the entropy for a labeling in nats.(ie, uses natural logarithm for the
- * calculations)
- */
-
-#include <cub/cub.cuh>
-#include <math.h>
-#include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
-#include <raft/linalg/divide.hpp>
-#include <raft/linalg/map_then_reduce.hpp>
-#include <rmm/device_scalar.hpp>
-#include <rmm/device_uvector.hpp>
-
-namespace MLCommon {
-
-/**
- * @brief Lambda to calculate the entropy of a sample given its probability value
- *
- * @param p: the input to the functional mapping
- * @param q: dummy param
- */
-struct entropyOp {
-  HDI double operator()(double p, double q)
-  {
-    if (p)
-      return -1 * (p) * (log(p));
-    else
-      return 0.0;
-  }
-};
-
-namespace Metrics {
-
-/**
- * @brief function to calculate the bincounts of number of samples in every label
- *
- * @tparam LabelT: type of the labels
- * @param labels: the pointer to the array containing labels for every data sample
- * @param binCountArray: pointer to the 1D array that contains the count of samples per cluster
- * @param nRows: number of data samples
- * @param lowerLabelRange
- * @param upperLabelRange
- * @param workspace: device buffer containing workspace memory
- * @param stream: the cuda stream where to launch this kernel
- */
-template <typename LabelT>
-void countLabels(const LabelT* labels,
-                 double* binCountArray,
-                 int nRows,
-                 LabelT lowerLabelRange,
-                 LabelT upperLabelRange,
-                 rmm::device_uvector<char>& workspace,
-                 cudaStream_t stream)
-{
-  int num_levels            = upperLabelRange - lowerLabelRange + 2;
-  LabelT lower_level        = lowerLabelRange;
-  LabelT upper_level        = upperLabelRange + 1;
-  size_t temp_storage_bytes = 0;
-
-  RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(nullptr,
-                                                    temp_storage_bytes,
-                                                    labels,
-                                                    binCountArray,
-                                                    num_levels,
-                                                    lower_level,
-                                                    upper_level,
-                                                    nRows,
-                                                    stream));
-
-  workspace.resize(temp_storage_bytes, stream);
-
-  RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(workspace.data(),
-                                                    temp_storage_bytes,
-                                                    labels,
-                                                    binCountArray,
-                                                    num_levels,
-                                                    lower_level,
-                                                    upper_level,
-                                                    nRows,
-                                                    stream));
-}
-
-/**
- * @brief Function to calculate entropy
- * <a href="https://en.wikipedia.org/wiki/Entropy_(information_theory)">more info on entropy</a>
- *
- * @param clusterArray: the array of classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
- * @return the entropy score
- */
-template <typename T>
-double entropy(const T* clusterArray,
-               const int size,
-               const T lowerLabelRange,
-               const T upperLabelRange,
-               cudaStream_t stream)
-{
-  if (!size) return 1.0;
-
-  T numUniqueClasses = upperLabelRange - lowerLabelRange + 1;
-
-  // declaring, allocating and initializing memory for bincount array and entropy values
-  rmm::device_uvector<double> prob(numUniqueClasses, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(prob.data(), 0, numUniqueClasses * sizeof(double), stream));
-  rmm::device_scalar<double> d_entropy(stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_entropy.data(), 0, sizeof(double), stream));
-
-  // workspace allocation
-  rmm::device_uvector<char> workspace(1, stream);
-
-  // calculating the bincounts and populating the prob array
-  countLabels(clusterArray, prob.data(), size, lowerLabelRange, upperLabelRange, workspace, stream);
-
-  // scalar dividing by size
-  raft::linalg::divideScalar<double>(
-    prob.data(), prob.data(), (double)size, numUniqueClasses, stream);
-
-  // calculating the aggregate entropy
-  raft::linalg::mapThenSumReduce<double, entropyOp>(
-    d_entropy.data(), numUniqueClasses, entropyOp(), stream, prob.data(), prob.data());
-
-  // updating in the host memory
-  double h_entropy;
-  raft::update_host(&h_entropy, d_entropy.data(), 1, stream);
-
-  raft::interruptible::synchronize(stream);
-
-  return h_entropy;
-}
-
-};  // end namespace Metrics
-};  // end namespace MLCommon
diff --git a/cpp/src_prims/metrics/homogeneity_score.cuh b/cpp/src_prims/metrics/homogeneity_score.cuh
deleted file mode 100644
index 8ba90b38e8..0000000000
--- a/cpp/src_prims/metrics/homogeneity_score.cuh
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * @file homogeneity_score.cuh
- *
- * @brief A clustering result satisfies homogeneity if all of its clusters
- * contain only data points which are members of a single class.
- */
-
-#include "entropy.cuh"
-#include "mutual_info_score.cuh"
-#include <raft/mr/device/allocator.hpp>
-
-namespace MLCommon {
-
-namespace Metrics {
-
-/**
- * @brief Function to calculate the homogeneity score between two clusters
- * <a href="https://en.wikipedia.org/wiki/Homogeneity_(statistics)">more info on mutual
- * information</a>
- * @param truthClusterArray: the array of truth classes of type T
- * @param predClusterArray: the array of predicted classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
- */
-template <typename T>
-double homogeneity_score(const T* truthClusterArray,
-                         const T* predClusterArray,
-                         int size,
-                         T lowerLabelRange,
-                         T upperLabelRange,
-                         cudaStream_t stream)
-{
-  if (size == 0) return 1.0;
-
-  double computedMI, computedEntropy;
-
-  computedMI = MLCommon::Metrics::mutual_info_score(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-  computedEntropy =
-    MLCommon::Metrics::entropy(truthClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-
-  double homogeneity;
-
-  if (computedEntropy) {
-    homogeneity = computedMI / computedEntropy;
-  } else
-    homogeneity = 1.0;
-
-  return homogeneity;
-}
-
-};  // end namespace Metrics
-};  // end namespace MLCommon
diff --git a/cpp/src_prims/metrics/kl_divergence.cuh b/cpp/src_prims/metrics/kl_divergence.cuh
deleted file mode 100644
index bce3bf7283..0000000000
--- a/cpp/src_prims/metrics/kl_divergence.cuh
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * @file kl_divergence.cuh
- * @brief The KL divergence tells us how well the probability distribution Q AKA candidatePDF
- * approximates the probability distribution P AKA modelPDF.
- */
-
-#pragma once
-
-#include <math.h>
-#include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
-#include <raft/linalg/map_then_reduce.hpp>
-#include <rmm/device_scalar.hpp>
-
-namespace MLCommon {
-
-/**
- * @brief the KL Diverence mapping function
- *
- * @tparam Type: Data type of the input
- * @param modelPDF: the model probability density function of type DataT
- * @param candidatePDF: the candidate probability density function of type DataT
- */
-template <typename Type>
-struct KLDOp {
-  HDI Type operator()(Type modelPDF, Type candidatePDF)
-  {
-    if (modelPDF == 0.0)
-      return 0;
-
-    else
-      return modelPDF * (log(modelPDF) - log(candidatePDF));
-  }
-};
-
-namespace Metrics {
-
-/**
- * @brief Function to calculate KL Divergence
- * <a href="https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence">more info on KL
- * Divergence</a>
- *
- * @tparam DataT: Data type of the input array
- * @param modelPDF: the model array of probability density functions of type DataT
- * @param candidatePDF: the candidate array of probability density functions of type DataT
- * @param size: the size of the data points of type int
- * @param stream: the cudaStream object
- */
-template <typename DataT>
-DataT kl_divergence(const DataT* modelPDF, const DataT* candidatePDF, int size, cudaStream_t stream)
-{
-  rmm::device_scalar<DataT> d_KLDVal(stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_KLDVal.data(), 0, sizeof(DataT), stream));
-
-  raft::linalg::mapThenSumReduce<DataT, KLDOp<DataT>, 256, const DataT*>(
-    d_KLDVal.data(), (size_t)size, KLDOp<DataT>(), stream, modelPDF, candidatePDF);
-
-  DataT h_KLDVal;
-
-  raft::update_host(&h_KLDVal, d_KLDVal.data(), 1, stream);
-
-  raft::interruptible::synchronize(stream);
-
-  return h_KLDVal;
-}
-
-};  // end namespace Metrics
-};  // end namespace MLCommon
diff --git a/cpp/src_prims/metrics/mutual_info_score.cuh b/cpp/src_prims/metrics/mutual_info_score.cuh
deleted file mode 100644
index f20de778e4..0000000000
--- a/cpp/src_prims/metrics/mutual_info_score.cuh
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * @file mutual_info_score.cuh
- * @brief The Mutual Information is a measure of the similarity between two labels of
- *   the same data.This metric is independent of the absolute values of the labels:
- *   a permutation of the class or cluster label values won't change the
- *   score value in any way.
- *   This metric is furthermore symmetric.This can be useful to
- *   measure the agreement of two independent label assignments strategies
- *   on the same dataset when the real ground truth is not known.
- */
-
-#include "contingencyMatrix.cuh"
-#include <cub/cub.cuh>
-#include <math.h>
-#include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
-#include <raft/linalg/reduce.hpp>
-#include <rmm/device_scalar.hpp>
-#include <rmm/device_uvector.hpp>
-
-namespace MLCommon {
-
-namespace Metrics {
-
-/**
- * @brief kernel to calculate the mutual info score
- * @param dContingencyMatrix: the contingency matrix corresponding to the two clusters
- * @param a: the row wise sum of the contingency matrix, which is also the bin counts of first
- * cluster array
- * @param b: the column wise sum of the contingency matrix, which is also the bin counts of second
- * cluster array
- * @param numUniqueClasses: number of unique classes
- * @param size: the size of array a and b (size of the contingency matrix is (size x size))
- * @param d_MI: pointer to the device memory that stores the aggreggate mutual information
- */
-template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y>
-__global__ void mutual_info_kernel(const int* dContingencyMatrix,
-                                   const int* a,
-                                   const int* b,
-                                   int numUniqueClasses,
-                                   int size,
-                                   double* d_MI)
-{
-  // calculating the indices of pairs of datapoints compared by the current thread
-  int j = threadIdx.x + blockIdx.x * blockDim.x;
-  int i = threadIdx.y + blockIdx.y * blockDim.y;
-
-  // thread-local variable to count the mutual info
-  double localMI = 0.0;
-
-  if (i < numUniqueClasses && j < numUniqueClasses && a[i] * b[j] != 0 &&
-      dContingencyMatrix[i * numUniqueClasses + j] != 0) {
-    localMI += (double(dContingencyMatrix[i * numUniqueClasses + j])) *
-               (log(double(size) * double(dContingencyMatrix[i * numUniqueClasses + j])) -
-                log(double(a[i] * b[j])));
-  }
-
-  // specialize blockReduce for a 2D block of 1024 threads of type uint64_t
-  typedef cub::BlockReduce<double, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
-    BlockReduce;
-
-  // Allocate shared memory for blockReduce
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-
-  // summing up thread-local counts specific to a block
-  localMI = BlockReduce(temp_storage).Sum(localMI);
-  __syncthreads();
-
-  // executed once per block
-  if (threadIdx.x == 0 && threadIdx.y == 0) { raft::myAtomicAdd(d_MI, localMI); }
-}
-
-/**
- * @brief Function to calculate the mutual information between two clusters
- * <a href="https://en.wikipedia.org/wiki/Mutual_information">more info on mutual information</a>
- * @param firstClusterArray: the array of classes of type T
- * @param secondClusterArray: the array of classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
- */
-template <typename T>
-double mutual_info_score(const T* firstClusterArray,
-                         const T* secondClusterArray,
-                         int size,
-                         T lowerLabelRange,
-                         T upperLabelRange,
-                         cudaStream_t stream)
-{
-  int numUniqueClasses = upperLabelRange - lowerLabelRange + 1;
-
-  // declaring, allocating and initializing memory for the contingency marix
-  rmm::device_uvector<int> dContingencyMatrix(numUniqueClasses * numUniqueClasses, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(
-    dContingencyMatrix.data(), 0, numUniqueClasses * numUniqueClasses * sizeof(int), stream));
-
-  // workspace allocation
-  size_t workspaceSz = MLCommon::Metrics::getContingencyMatrixWorkspaceSize(
-    size, firstClusterArray, stream, lowerLabelRange, upperLabelRange);
-  rmm::device_uvector<char> pWorkspace(workspaceSz, stream);
-
-  // calculating the contingency matrix
-  MLCommon::Metrics::contingencyMatrix(firstClusterArray,
-                                       secondClusterArray,
-                                       (int)size,
-                                       (int*)dContingencyMatrix.data(),
-                                       stream,
-                                       (void*)pWorkspace.data(),
-                                       workspaceSz,
-                                       lowerLabelRange,
-                                       upperLabelRange);
-
-  // creating device buffers for all the parameters involved in ARI calculation
-  // device variables
-  rmm::device_uvector<int> a(numUniqueClasses, stream);
-  rmm::device_uvector<int> b(numUniqueClasses, stream);
-  rmm::device_scalar<double> d_MI(stream);
-
-  // host variables
-  double h_MI;
-
-  // initializing device memory
-  RAFT_CUDA_TRY(cudaMemsetAsync(a.data(), 0, numUniqueClasses * sizeof(int), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(b.data(), 0, numUniqueClasses * sizeof(int), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_MI.data(), 0, sizeof(double), stream));
-
-  // calculating the row-wise sums
-  raft::linalg::reduce<int, int, int>(
-    a.data(), dContingencyMatrix.data(), numUniqueClasses, numUniqueClasses, 0, true, true, stream);
-
-  // calculating the column-wise sums
-  raft::linalg::reduce<int, int, int>(b.data(),
-                                      dContingencyMatrix.data(),
-                                      numUniqueClasses,
-                                      numUniqueClasses,
-                                      0,
-                                      true,
-                                      false,
-                                      stream);
-
-  // kernel configuration
-  static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16;
-  dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y);
-  dim3 numBlocks(raft::ceildiv<int>(numUniqueClasses, numThreadsPerBlock.x),
-                 raft::ceildiv<int>(numUniqueClasses, numThreadsPerBlock.y));
-
-  // calling the kernel
-  mutual_info_kernel<T, BLOCK_DIM_X, BLOCK_DIM_Y><<<numBlocks, numThreadsPerBlock, 0, stream>>>(
-    dContingencyMatrix.data(), a.data(), b.data(), numUniqueClasses, size, d_MI.data());
-
-  // updating in the host memory
-  h_MI = d_MI.value(stream);
-
-  raft::interruptible::synchronize(stream);
-
-  return h_MI / size;
-}
-
-};  // end namespace Metrics
-};  // end namespace MLCommon
diff --git a/cpp/src_prims/metrics/rand_index.cuh b/cpp/src_prims/metrics/rand_index.cuh
deleted file mode 100644
index f1acf30ac5..0000000000
--- a/cpp/src_prims/metrics/rand_index.cuh
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file rand_index.cuh
- * @todo TODO(Ganesh Venkataramana):
- * <pre>
- * The below rand_index calculation implementation is a Brute force one that uses
- (nElements*nElements) threads (2 dimensional grids and blocks)
- * For small datasets, this will suffice; but for larger ones, work done by the threads increase
- dramatically.
- * A more mathematically intensive implementation that uses half the above threads can be done,
- which will prove to be more efficient for larger datasets
- * the idea is as follows:
-  * instead of 2D block and grid configuration with a total of (nElements*nElements) threads (where
- each (i,j) through these threads represent an ordered pair selection of 2 data points), a 1D block
- and grid configuration with a total of (nElements*(nElements))/2 threads (each thread index
- represents an element part of the set of unordered pairwise selections from the dataset (nChoose2))
-  * In this setup, one has to generate a one-to-one mapping between this 1D thread index (for each
- kernel) and the unordered pair of chosen datapoints.
-  * More specifically, thread0-> {dataPoint1, dataPoint0}, thread1-> {dataPoint2, dataPoint0},
- thread2-> {dataPoint2, dataPoint1} ... thread((nElements*(nElements))/2 - 1)->
- {dataPoint(nElements-1),dataPoint(nElements-2)}
-  * say ,
-     * threadNum: thread index | threadNum = threadIdx.x + BlockIdx.x*BlockDim.x,
-     * i : index of dataPoint i
-     * j : index of dataPoint j
-  * then the mapping is as follows:
-     * i = ceil((-1 + sqrt(1 + 8*(1 + threadNum)))/2) = floor((1 + sqrt(1 + 8*threadNum))/2)
-     * j = threadNum - i(i-1)/2
-  * after obtaining the the pair of datapoints, calculation of rand index is the same as done in
- this implementation
- * Caveat: since the kernel implementation involves use of emulated sqrt() operations:
-  * the number of instructions executed per kernel is ~40-50 times
-  * as the O(nElements*nElements) increase beyond the floating point limit, floating point
- inaccuracies occur, and hence the above floor(...) !=  ceil(...)
- * </pre>
- */
-
-#pragma once
-
-#include <cub/cub.cuh>
-#include <math.h>
-#include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
-#include <rmm/device_uvector.hpp>
-
-namespace MLCommon {
-namespace Metrics {
-
-/**
- * @brief kernel to calculate the values of a and b
- * @param firstClusterArray: the array of classes of type T
- * @param secondClusterArray: the array of classes of type T
- * @param size: the size of the data points
- * @param a: number of pairs of points that both the clusters have classified the same
- * @param b: number of pairs of points that both the clusters have classified differently
- */
-template <typename T, int BLOCK_DIM_X, int BLOCK_DIM_Y>
-__global__ void computeTheNumerator(
-  const T* firstClusterArray, const T* secondClusterArray, uint64_t size, uint64_t* a, uint64_t* b)
-{
-  // calculating the indices of pairs of datapoints compared by the current thread
-  uint64_t j = threadIdx.x + blockIdx.x * blockDim.x;
-  uint64_t i = threadIdx.y + blockIdx.y * blockDim.y;
-
-  // thread-local variables to count a and b
-  uint64_t myA = 0, myB = 0;
-
-  if (i < size && j < size && j < i) {
-    // checking if the pair have been classified the same by both the clusters
-    if (firstClusterArray[i] == firstClusterArray[j] &&
-        secondClusterArray[i] == secondClusterArray[j]) {
-      ++myA;
-    }
-
-    // checking if the pair have been classified differently by both the clusters
-    else if (firstClusterArray[i] != firstClusterArray[j] &&
-             secondClusterArray[i] != secondClusterArray[j]) {
-      ++myB;
-    }
-  }
-
-  // specialize blockReduce for a 2D block of 1024 threads of type uint64_t
-  typedef cub::BlockReduce<uint64_t, BLOCK_DIM_X, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BLOCK_DIM_Y>
-    BlockReduce;
-
-  // Allocate shared memory for blockReduce
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-
-  // summing up thread-local counts specific to a block
-  myA = BlockReduce(temp_storage).Sum(myA);
-  __syncthreads();
-  myB = BlockReduce(temp_storage).Sum(myB);
-  __syncthreads();
-
-  // executed once per block
-  if (threadIdx.x == 0 && threadIdx.y == 0) {
-    raft::myAtomicAdd<unsigned long long int>((unsigned long long int*)a, myA);
-    raft::myAtomicAdd<unsigned long long int>((unsigned long long int*)b, myB);
-  }
-}
-
-/**
- * @brief Function to calculate RandIndex
- * <a href="https://en.wikipedia.org/wiki/Rand_index">more info on rand index</a>
- * @param firstClusterArray: the array of classes of type T
- * @param secondClusterArray: the array of classes of type T
- * @param size: the size of the data points of type uint64_t
- * @param stream: the cudaStream object
- */
-template <typename T>
-double compute_rand_index(T* firstClusterArray,
-                          T* secondClusterArray,
-                          uint64_t size,
-                          cudaStream_t stream)
-{
-  // rand index for size less than 2 is not defined
-  ASSERT(size >= 2, "Rand Index for size less than 2 not defined!");
-
-  // allocating and initializing memory for a and b in the GPU
-  rmm::device_uvector<uint64_t> arr_buf(2, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream));
-
-  // kernel configuration
-  static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16;
-  dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y);
-  dim3 numBlocks(raft::ceildiv<int>(size, numThreadsPerBlock.x),
-                 raft::ceildiv<int>(size, numThreadsPerBlock.y));
-
-  // calling the kernel
-  computeTheNumerator<T, BLOCK_DIM_X, BLOCK_DIM_Y><<<numBlocks, numThreadsPerBlock, 0, stream>>>(
-    firstClusterArray, secondClusterArray, size, arr_buf.data(), arr_buf.data() + 1);
-
-  // synchronizing and updating the calculated values of a and b from device to host
-  uint64_t ab_host[2] = {0};
-  raft::update_host(ab_host, arr_buf.data(), 2, stream);
-  raft::interruptible::synchronize(stream);
-
-  // error handling
-  RAFT_CUDA_TRY(cudaGetLastError());
-
-  // denominator
-  uint64_t nChooseTwo = size * (size - 1) / 2;
-
-  // calculating the rand_index
-  return (double)(((double)(ab_host[0] + ab_host[1])) / (double)nChooseTwo);
-}
-
-};  // end namespace Metrics
-};  // end namespace MLCommon
diff --git a/cpp/src_prims/metrics/scores.cuh b/cpp/src_prims/metrics/scores.cuh
deleted file mode 100644
index 50fa3eca04..0000000000
--- a/cpp/src_prims/metrics/scores.cuh
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <memory>
-#include <raft/cudart_utils.h>
-#include <raft/distance/distance.hpp>
-#include <raft/linalg/eltwise.hpp>
-#include <raft/linalg/power.cuh>
-#include <raft/linalg/subtract.hpp>
-#include <raft/spatial/knn/knn.hpp>
-#include <raft/stats/mean.hpp>
-#include <rmm/device_scalar.hpp>
-#include <rmm/device_uvector.hpp>
-#include <selection/columnWiseSort.cuh>
-#include <thrust/device_ptr.h>
-#include <thrust/reduce.h>
-
-#define N_THREADS 512
-
-namespace MLCommon {
-namespace Score {
-
-/**
- * Calculates the "Coefficient of Determination" (R-Squared) score
- * normalizing the sum of squared errors by the total sum of squares.
- *
- * This score indicates the proportionate amount of variation in an
- * expected response variable is explained by the independent variables
- * in a linear regression model. The larger the R-squared value, the
- * more variability is explained by the linear regression model.
- *
- * @param y: Array of ground-truth response variables
- * @param y_hat: Array of predicted response variables
- * @param n: Number of elements in y and y_hat
- * @param stream: cuda stream
- * @return: The R-squared value.
- */
-template <typename math_t>
-math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream)
-{
-  rmm::device_scalar<math_t> y_bar(stream);
-
-  raft::stats::mean(y_bar.data(), y, 1, n, false, false, stream);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-  rmm::device_uvector<math_t> sse_arr(n, stream);
-
-  raft::linalg::eltwiseSub(sse_arr.data(), y, y_hat, n, stream);
-  raft::linalg::powerScalar(sse_arr.data(), sse_arr.data(), math_t(2.0), n, stream);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-  rmm::device_uvector<math_t> ssto_arr(n, stream);
-
-  raft::linalg::subtractDevScalar(ssto_arr.data(), y, y_bar.data(), n, stream);
-  raft::linalg::powerScalar(ssto_arr.data(), ssto_arr.data(), math_t(2.0), n, stream);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-  thrust::device_ptr<math_t> d_sse  = thrust::device_pointer_cast(sse_arr.data());
-  thrust::device_ptr<math_t> d_ssto = thrust::device_pointer_cast(ssto_arr.data());
-
-  math_t sse  = thrust::reduce(thrust::cuda::par.on(stream), d_sse, d_sse + n);
-  math_t ssto = thrust::reduce(thrust::cuda::par.on(stream), d_ssto, d_ssto + n);
-
-  return 1.0 - sse / ssto;
-}
-
-/**
- * @brief Compute accuracy of predictions. Useful for classification.
- * @tparam math_t: data type for predictions (e.g., int for classification)
- * @param[in] predictions: array of predictions (GPU pointer).
- * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer).
- * @param[in] n: number of elements in each of predictions, ref_predictions.
- * @param[in] stream: cuda stream.
- * @return: Accuracy score in [0, 1]; higher is better.
- */
-template <typename math_t>
-float accuracy_score(const math_t* predictions,
-                     const math_t* ref_predictions,
-                     int n,
-                     cudaStream_t stream)
-{
-  unsigned long long correctly_predicted = 0ULL;
-  rmm::device_uvector<math_t> diffs_array(n, stream);
-
-  // TODO could write a kernel instead
-  raft::linalg::eltwiseSub(diffs_array.data(), predictions, ref_predictions, n, stream);
-  RAFT_CUDA_TRY(cudaGetLastError());
-  correctly_predicted =
-    thrust::count(thrust::cuda::par.on(stream), diffs_array.data(), diffs_array.data() + n, 0);
-
-  float accuracy = correctly_predicted * 1.0f / n;
-  return accuracy;
-}
-
-template <typename T>
-__global__ void reg_metrics_kernel(
-  const T* predictions, const T* ref_predictions, int n, double* abs_diffs, double* tmp_sums)
-{
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  __shared__ double shmem[2];  // {abs_difference_sum, squared difference sum}
-
-  for (int i = threadIdx.x; i < 2; i += blockDim.x) {
-    shmem[i] = 0;
-  }
-  __syncthreads();
-
-  for (int i = tid; i < n; i += blockDim.x * gridDim.x) {
-    double diff     = predictions[i] - ref_predictions[i];
-    double abs_diff = abs(diff);
-    raft::myAtomicAdd(&shmem[0], abs_diff);
-    raft::myAtomicAdd(&shmem[1], diff * diff);
-
-    // update absolute difference in global memory for subsequent abs. median computation
-    abs_diffs[i] = abs_diff;
-  }
-  __syncthreads();
-
-  // Update tmp_sum w/ total abs_difference_sum and squared difference sum.
-  for (int i = threadIdx.x; i < 2; i += blockDim.x) {
-    raft::myAtomicAdd(&tmp_sums[i], shmem[i]);
-  }
-}
-
-/**
- * @brief Compute regression metrics mean absolute error, mean squared error, median absolute error
- * @tparam T: data type for predictions (e.g., float or double for regression).
- * @param[in] predictions: array of predictions (GPU pointer).
- * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer).
- * @param[in] n: number of elements in each of predictions, ref_predictions. Should be > 0.
- * @param[in] stream: cuda stream.
- * @param[out] mean_abs_error: Mean Absolute Error. Sum over n of (|predictions[i] -
- * ref_predictions[i]|) / n.
- * @param[out] mean_squared_error: Mean Squared Error. Sum over n of ((predictions[i] -
- * ref_predictions[i])^2) / n.
- * @param[out] median_abs_error: Median Absolute Error. Median of |predictions[i] -
- * ref_predictions[i]| for i in [0, n).
- */
-template <typename T>
-void regression_metrics(const T* predictions,
-                        const T* ref_predictions,
-                        int n,
-                        cudaStream_t stream,
-                        double& mean_abs_error,
-                        double& mean_squared_error,
-                        double& median_abs_error)
-{
-  std::vector<double> mean_errors(2);
-  std::vector<double> h_sorted_abs_diffs(n);
-  int thread_cnt = 256;
-  int block_cnt  = raft::ceildiv(n, thread_cnt);
-
-  int array_size = n * sizeof(double);
-  rmm::device_uvector<double> abs_diffs_array(array_size, stream);
-  rmm::device_uvector<double> sorted_abs_diffs(array_size, stream);
-  rmm::device_uvector<double> tmp_sums(2 * sizeof(double), stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(tmp_sums.data(), 0, 2 * sizeof(double), stream));
-
-  reg_metrics_kernel<T><<<block_cnt, thread_cnt, 0, stream>>>(
-    predictions, ref_predictions, n, abs_diffs_array.data(), tmp_sums.data());
-  RAFT_CUDA_TRY(cudaGetLastError());
-  raft::update_host(&mean_errors[0], tmp_sums.data(), 2, stream);
-  raft::interruptible::synchronize(stream);
-
-  mean_abs_error     = mean_errors[0] / n;
-  mean_squared_error = mean_errors[1] / n;
-
-  // Compute median error. Sort diffs_array and pick median value
-  char* temp_storage = nullptr;
-  size_t temp_storage_bytes;
-  RAFT_CUDA_TRY(cub::DeviceRadixSort::SortKeys((void*)temp_storage,
-                                               temp_storage_bytes,
-                                               abs_diffs_array.data(),
-                                               sorted_abs_diffs.data(),
-                                               n,
-                                               0,
-                                               8 * sizeof(double),
-                                               stream));
-  rmm::device_uvector<char> temp_storage_v(temp_storage_bytes, stream);
-  temp_storage = temp_storage_v.data();
-  RAFT_CUDA_TRY(cub::DeviceRadixSort::SortKeys((void*)temp_storage,
-                                               temp_storage_bytes,
-                                               abs_diffs_array.data(),
-                                               sorted_abs_diffs.data(),
-                                               n,
-                                               0,
-                                               8 * sizeof(double),
-                                               stream));
-
-  raft::update_host(h_sorted_abs_diffs.data(), sorted_abs_diffs.data(), n, stream);
-  raft::interruptible::synchronize(stream);
-
-  int middle = n / 2;
-  if (n % 2 == 1) {
-    median_abs_error = h_sorted_abs_diffs[middle];
-  } else {
-    median_abs_error = (h_sorted_abs_diffs[middle] + h_sorted_abs_diffs[middle - 1]) / 2;
-  }
-}
-}  // namespace Score
-}  // namespace MLCommon
diff --git a/cpp/src_prims/metrics/silhouette_score.cuh b/cpp/src_prims/metrics/silhouette_score.cuh
deleted file mode 100644
index fa23e85722..0000000000
--- a/cpp/src_prims/metrics/silhouette_score.cuh
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <algorithm>
-#include <cub/cub.cuh>
-#include <cuml/metrics/metrics.hpp>
-#include <iostream>
-#include <math.h>
-#include <numeric>
-#include <raft/cuda_utils.cuh>
-#include <raft/distance/distance.hpp>
-#include <raft/distance/distance_type.hpp>
-#include <raft/linalg/add.hpp>
-#include <raft/linalg/eltwise.hpp>
-#include <raft/linalg/map_then_reduce.hpp>
-#include <raft/linalg/matrix_vector_op.hpp>
-#include <raft/linalg/reduce.hpp>
-#include <raft/linalg/reduce_cols_by_key.cuh>
-#include <rmm/device_scalar.hpp>
-
-namespace MLCommon {
-namespace Metrics {
-
-/**
- * @brief kernel that calculates the average intra-cluster distance for every sample data point and
- * updates the cluster distance to max value
- * @tparam DataT: type of the data samples
- * @tparam LabelT: type of the labels
- * @param sampleToClusterSumOfDistances: the pointer to the 2D array that contains the sum of
- * distances from every sample to every cluster (nRows x nLabels)
- * @param binCountArray: pointer to the 1D array that contains the count of samples per cluster (1 x
- * nLabels)
- * @param d_aArray: the pointer to the array of average intra-cluster distances for every sample in
- * device memory (1 x nRows)
- * @param labels: the pointer to the array containing labels for every data sample (1 x nRows)
- * @param nRows: number of data samples
- * @param nLabels: number of Labels
- * @param MAX_VAL: DataT specific upper limit
- */
-template <typename DataT, typename LabelT>
-__global__ void populateAKernel(DataT* sampleToClusterSumOfDistances,
-                                DataT* binCountArray,
-                                DataT* d_aArray,
-                                LabelT* labels,
-                                int nRows,
-                                int nLabels,
-                                const DataT MAX_VAL)
-{
-  // getting the current index
-  int sampleIndex = threadIdx.x + blockIdx.x * blockDim.x;
-
-  if (sampleIndex >= nRows) return;
-
-  // sampleDistanceVector is an array that stores that particular row of the distanceMatrix
-  DataT* sampleToClusterSumOfDistancesVector =
-    &sampleToClusterSumOfDistances[sampleIndex * nLabels];
-
-  LabelT sampleCluster = labels[sampleIndex];
-
-  int sampleClusterIndex = (int)sampleCluster;
-
-  if (binCountArray[sampleClusterIndex] - 1 <= 0) {
-    d_aArray[sampleIndex] = -1;
-    return;
-
-  }
-
-  else {
-    d_aArray[sampleIndex] = (sampleToClusterSumOfDistancesVector[sampleClusterIndex]) /
-                            (binCountArray[sampleClusterIndex] - 1);
-
-    // modifying the sampleDistanceVector to give sample average distance
-    sampleToClusterSumOfDistancesVector[sampleClusterIndex] = MAX_VAL;
-  }
-}
-
-/**
- * @brief function to calculate the bincounts of number of samples in every label
- * @tparam DataT: type of the data samples
- * @tparam LabelT: type of the labels
- * @param labels: the pointer to the array containing labels for every data sample (1 x nRows)
- * @param binCountArray: pointer to the 1D array that contains the count of samples per cluster (1 x
- * nLabels)
- * @param nRows: number of data samples
- * @param nUniqueLabels: number of Labels
- * @param workspace: device buffer containing workspace memory
- * @param stream: the cuda stream where to launch this kernel
- */
-template <typename DataT, typename LabelT>
-void countLabels(LabelT* labels,
-                 DataT* binCountArray,
-                 int nRows,
-                 int nUniqueLabels,
-                 rmm::device_uvector<char>& workspace,
-                 cudaStream_t stream)
-{
-  int num_levels            = nUniqueLabels + 1;
-  LabelT lower_level        = 0;
-  LabelT upper_level        = nUniqueLabels;
-  size_t temp_storage_bytes = 0;
-
-  rmm::device_uvector<int> countArray(nUniqueLabels, stream);
-
-  RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(nullptr,
-                                                    temp_storage_bytes,
-                                                    labels,
-                                                    binCountArray,
-                                                    num_levels,
-                                                    lower_level,
-                                                    upper_level,
-                                                    nRows,
-                                                    stream));
-
-  workspace.resize(temp_storage_bytes, stream);
-
-  RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(workspace.data(),
-                                                    temp_storage_bytes,
-                                                    labels,
-                                                    binCountArray,
-                                                    num_levels,
-                                                    lower_level,
-                                                    upper_level,
-                                                    nRows,
-                                                    stream));
-}
-
-/**
- * @brief stucture that defines the division Lambda for elementwise op
- */
-template <typename DataT>
-struct DivOp {
-  HDI DataT operator()(DataT a, int b, int c)
-  {
-    if (b == 0)
-      return ULLONG_MAX;
-    else
-      return a / b;
-  }
-};
-
-/**
- * @brief stucture that defines the elementwise operation to calculate silhouette score using params
- * 'a' and 'b'
- */
-template <typename DataT>
-struct SilOp {
-  HDI DataT operator()(DataT a, DataT b)
-  {
-    if (a == 0 && b == 0 || a == b)
-      return 0;
-    else if (a == -1)
-      return 0;
-    else if (a > b)
-      return (b - a) / a;
-    else
-      return (b - a) / b;
-  }
-};
-
-/**
- * @brief stucture that defines the reduction Lambda to find minimum between elements
- */
-template <typename DataT>
-struct MinOp {
-  HDI DataT operator()(DataT a, DataT b)
-  {
-    if (a > b)
-      return b;
-    else
-      return a;
-  }
-};
-
-/**
- * @brief main function that returns the average silhouette score for a given set of data and its
- * clusterings
- * @tparam DataT: type of the data samples
- * @tparam LabelT: type of the labels
- * @param X_in: pointer to the input Data samples array (nRows x nCols)
- * @param nRows: number of data samples
- * @param nCols: number of features
- * @param labels: the pointer to the array containing labels for every data sample (1 x nRows)
- * @param nLabels: number of Labels
- * @param silhouette_scorePerSample: pointer to the array that is optionally taken in as input and
- * is populated with the silhouette score for every sample (1 x nRows)
- * @param stream: the cuda stream where to launch this kernel
- * @param metric: the numerical value that maps to the type of distance metric to be used in the
- * calculations
- */
-template <typename DataT, typename LabelT>
-DataT silhouette_score(
-  const raft::handle_t& handle,
-  DataT* X_in,
-  int nRows,
-  int nCols,
-  LabelT* labels,
-  int nLabels,
-  DataT* silhouette_scorePerSample,
-  cudaStream_t stream,
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded)
-{
-  ASSERT(nLabels >= 2 && nLabels <= (nRows - 1),
-         "silhouette Score not defined for the given number of labels!");
-
-  // compute the distance matrix
-  rmm::device_uvector<DataT> distanceMatrix(nRows * nRows, stream);
-  rmm::device_uvector<char> workspace(1, stream);
-
-  ML::Metrics::pairwise_distance(
-    handle, X_in, X_in, distanceMatrix.data(), nRows, nRows, nCols, metric);
-
-  // deciding on the array of silhouette scores for each dataPoint
-  rmm::device_uvector<DataT> silhouette_scoreSamples(0, stream);
-  DataT* perSampleSilScore = nullptr;
-  if (silhouette_scorePerSample == nullptr) {
-    silhouette_scoreSamples.resize(nRows, stream);
-    perSampleSilScore = silhouette_scoreSamples.data();
-  } else {
-    perSampleSilScore = silhouette_scorePerSample;
-  }
-  RAFT_CUDA_TRY(cudaMemsetAsync(perSampleSilScore, 0, nRows * sizeof(DataT), stream));
-
-  // getting the sample count per cluster
-  rmm::device_uvector<DataT> binCountArray(nLabels, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(binCountArray.data(), 0, nLabels * sizeof(DataT), stream));
-  countLabels(labels, binCountArray.data(), nRows, nLabels, workspace, stream);
-
-  // calculating the sample-cluster-distance-sum-array
-  rmm::device_uvector<DataT> sampleToClusterSumOfDistances(nRows * nLabels, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(
-    sampleToClusterSumOfDistances.data(), 0, nRows * nLabels * sizeof(DataT), stream));
-  raft::linalg::reduce_cols_by_key(distanceMatrix.data(),
-                                   labels,
-                                   sampleToClusterSumOfDistances.data(),
-                                   nRows,
-                                   nRows,
-                                   nLabels,
-                                   stream);
-
-  // creating the a array and b array
-  rmm::device_uvector<DataT> d_aArray(nRows, stream);
-  rmm::device_uvector<DataT> d_bArray(nRows, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_aArray.data(), 0, nRows * sizeof(DataT), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_bArray.data(), 0, nRows * sizeof(DataT), stream));
-
-  // kernel that populates the d_aArray
-  // kernel configuration
-  dim3 numThreadsPerBlock(32, 1, 1);
-  dim3 numBlocks(raft::ceildiv<int>(nRows, numThreadsPerBlock.x), 1, 1);
-
-  // calling the kernel
-  populateAKernel<<<numBlocks, numThreadsPerBlock, 0, stream>>>(
-    sampleToClusterSumOfDistances.data(),
-    binCountArray.data(),
-    d_aArray.data(),
-    labels,
-    nRows,
-    nLabels,
-    std::numeric_limits<DataT>::max());
-
-  // elementwise dividing by bincounts
-  rmm::device_uvector<DataT> averageDistanceBetweenSampleAndCluster(nRows * nLabels, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(
-    averageDistanceBetweenSampleAndCluster.data(), 0, nRows * nLabels * sizeof(DataT), stream));
-
-  raft::linalg::matrixVectorOp<DataT, DivOp<DataT>>(averageDistanceBetweenSampleAndCluster.data(),
-                                                    sampleToClusterSumOfDistances.data(),
-                                                    binCountArray.data(),
-                                                    binCountArray.data(),
-                                                    nLabels,
-                                                    nRows,
-                                                    true,
-                                                    true,
-                                                    DivOp<DataT>(),
-                                                    stream);
-
-  // calculating row-wise minimum
-  raft::linalg::reduce<DataT, DataT, int, raft::Nop<DataT>, MinOp<DataT>>(
-    d_bArray.data(),
-    averageDistanceBetweenSampleAndCluster.data(),
-    nLabels,
-    nRows,
-    std::numeric_limits<DataT>::max(),
-    true,
-    true,
-    stream,
-    false,
-    raft::Nop<DataT>(),
-    MinOp<DataT>());
-
-  // calculating the silhouette score per sample using the d_aArray and d_bArray
-  raft::linalg::binaryOp<DataT, SilOp<DataT>>(
-    perSampleSilScore, d_aArray.data(), d_bArray.data(), nRows, SilOp<DataT>(), stream);
-
-  // calculating the sum of all the silhouette score
-  rmm::device_scalar<DataT> d_avgSilhouetteScore(stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(d_avgSilhouetteScore.data(), 0, sizeof(DataT), stream));
-
-  raft::linalg::mapThenSumReduce<double, raft::Nop<DataT>>(d_avgSilhouetteScore.data(),
-                                                           nRows,
-                                                           raft::Nop<DataT>(),
-                                                           stream,
-                                                           perSampleSilScore,
-                                                           perSampleSilScore);
-
-  DataT avgSilhouetteScore = d_avgSilhouetteScore.value(stream);
-
-  handle.sync_stream(stream);
-
-  avgSilhouetteScore /= nRows;
-
-  return avgSilhouetteScore;
-}
-
-};  // namespace Metrics
-};  // namespace MLCommon
diff --git a/cpp/src_prims/metrics/trustworthiness_score.cuh b/cpp/src_prims/metrics/trustworthiness_score.cuh
deleted file mode 100644
index c476cd209d..0000000000
--- a/cpp/src_prims/metrics/trustworthiness_score.cuh
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cuml/metrics/metrics.hpp>
-#include <raft/distance/specializations.hpp>
-#include <raft/spatial/knn/knn.hpp>
-#include <raft/spatial/knn/specializations.hpp>
-#include <rmm/device_scalar.hpp>
-#include <rmm/device_uvector.hpp>
-#include <selection/columnWiseSort.cuh>
-
-#define N_THREADS 512
-
-namespace MLCommon {
-namespace Score {
-
-/**
- * @brief Build the lookup table
- * @param[out] lookup_table: Lookup table giving nearest neighbor order
- *                of pairwise distance calculations given sample index
- * @param[in] X_ind: Sorted indexes of pairwise distance calculations of X
- * @param n: Number of samples
- * @param work: Number of elements to consider
- */
-__global__ void build_lookup_table(int* lookup_table, const int* X_ind, int n, int work)
-{
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i >= work) return;
-
-  int sample_idx = i / n;
-  int nn_idx     = i % n;
-
-  int idx                              = X_ind[i];
-  lookup_table[(sample_idx * n) + idx] = nn_idx;
-}
-
-/**
- * @brief Compute a the rank of trustworthiness score
- * @param[out] rank: Resulting rank
- * @param[out] lookup_table: Lookup table giving nearest neighbor order
- *                of pairwise distance calculations given sample index
- * @param[in] emb_ind: Indexes of KNN on embeddings
- * @param n: Number of samples
- * @param n_neighbors: Number of neighbors considered by trustworthiness score
- * @param work: Batch to consider (to do it at once use n * n_neighbors)
- */
-template <typename knn_index_t>
-__global__ void compute_rank(double* rank,
-                             const int* lookup_table,
-                             const knn_index_t* emb_ind,
-                             int n,
-                             int n_neighbors,
-                             int work)
-{
-  int i = blockIdx.x * blockDim.x + threadIdx.x;
-  if (i >= work) return;
-
-  int sample_idx = i / n_neighbors;
-
-  knn_index_t emb_nn_ind = emb_ind[i];
-
-  int r   = lookup_table[(sample_idx * n) + emb_nn_ind];
-  int tmp = r - n_neighbors + 1;
-  if (tmp > 0) raft::myAtomicAdd<double>(rank, tmp);
-}
-
-/**
- * @brief Compute a kNN and returns the indices of the nearest neighbors
- * @param h Raft handle
- * @param[in] input Input matrix containing the dataset
- * @param n Number of samples
- * @param d Number of features
- * @param n_neighbors number of neighbors
- * @param[out] indices KNN indexes
- * @param[out] distances KNN distances
- */
-template <raft::distance::DistanceType distance_type, typename math_t>
-void run_knn(const raft::handle_t& h,
-             math_t* input,
-             int n,
-             int d,
-             int n_neighbors,
-             int64_t* indices,
-             math_t* distances)
-{
-  std::vector<math_t*> ptrs(1);
-  std::vector<int> sizes(1);
-  ptrs[0]  = input;
-  sizes[0] = n;
-
-  raft::spatial::knn::brute_force_knn<int64_t, float, int>(h,
-                                                           ptrs,
-                                                           sizes,
-                                                           d,
-                                                           input,
-                                                           n,
-                                                           indices,
-                                                           distances,
-                                                           n_neighbors,
-                                                           true,
-                                                           true,
-                                                           nullptr,
-                                                           distance_type);
-}
-
-/**
- * @brief Compute the trustworthiness score
- * @param h Raft handle
- * @param X[in]: Data in original dimension
- * @param X_embedded[in]: Data in target dimension (embedding)
- * @param n: Number of samples
- * @param m: Number of features in high/original dimension
- * @param d: Number of features in low/embedded dimension
- * @param n_neighbors Number of neighbors considered by trustworthiness score
- * @param batchSize Batch size
- * @return Trustworthiness score
- */
-template <typename math_t, raft::distance::DistanceType distance_type>
-double trustworthiness_score(const raft::handle_t& h,
-                             const math_t* X,
-                             math_t* X_embedded,
-                             int n,
-                             int m,
-                             int d,
-                             int n_neighbors,
-                             int batchSize = 512)
-{
-  cudaStream_t stream = h.get_stream();
-
-  const int KNN_ALLOC = n * (n_neighbors + 1);
-  rmm::device_uvector<int64_t> emb_ind(KNN_ALLOC, stream);
-  rmm::device_uvector<math_t> emb_dist(KNN_ALLOC, stream);
-
-  run_knn<distance_type>(h, X_embedded, n, d, n_neighbors + 1, emb_ind.data(), emb_dist.data());
-
-  const int PAIRWISE_ALLOC = batchSize * n;
-  rmm::device_uvector<int> X_ind(PAIRWISE_ALLOC, stream);
-  rmm::device_uvector<math_t> X_dist(PAIRWISE_ALLOC, stream);
-  rmm::device_uvector<int> lookup_table(PAIRWISE_ALLOC, stream);
-
-  double t = 0.0;
-  rmm::device_scalar<double> t_dbuf(stream);
-
-  int toDo = n;
-  while (toDo > 0) {
-    int curBatchSize = min(toDo, batchSize);
-
-    // Takes at most batchSize vectors at a time
-    ML::Metrics::pairwise_distance(
-      h, &X[(n - toDo) * m], X, X_dist.data(), curBatchSize, n, m, distance_type);
-
-    size_t colSortWorkspaceSize = 0;
-    bool bAllocWorkspace        = false;
-
-    MLCommon::Selection::sortColumnsPerRow(X_dist.data(),
-                                           X_ind.data(),
-                                           curBatchSize,
-                                           n,
-                                           bAllocWorkspace,
-                                           nullptr,
-                                           colSortWorkspaceSize,
-                                           stream);
-
-    if (bAllocWorkspace) {
-      rmm::device_uvector<char> sortColsWorkspace(colSortWorkspaceSize, stream);
-
-      MLCommon::Selection::sortColumnsPerRow(X_dist.data(),
-                                             X_ind.data(),
-                                             curBatchSize,
-                                             n,
-                                             bAllocWorkspace,
-                                             sortColsWorkspace.data(),
-                                             colSortWorkspaceSize,
-                                             stream);
-    }
-
-    int work     = curBatchSize * n;
-    int n_blocks = raft::ceildiv(work, N_THREADS);
-    build_lookup_table<<<n_blocks, N_THREADS, 0, stream>>>(
-      lookup_table.data(), X_ind.data(), n, work);
-
-    RAFT_CUDA_TRY(cudaMemsetAsync(t_dbuf.data(), 0, sizeof(double), stream));
-
-    work     = curBatchSize * (n_neighbors + 1);
-    n_blocks = raft::ceildiv(work, N_THREADS);
-    compute_rank<<<n_blocks, N_THREADS, 0, stream>>>(
-      t_dbuf.data(),
-      lookup_table.data(),
-      &emb_ind.data()[(n - toDo) * (n_neighbors + 1)],
-      n,
-      n_neighbors + 1,
-      work);
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-    t += t_dbuf.value(stream);
-
-    toDo -= curBatchSize;
-  }
-
-  t = 1.0 - ((2.0 / ((n * n_neighbors) * ((2.0 * n) - (3.0 * n_neighbors) - 1.0))) * t);
-
-  return t;
-}
-}  // namespace Score
-}  // namespace MLCommon
diff --git a/cpp/src_prims/metrics/v_measure.cuh b/cpp/src_prims/metrics/v_measure.cuh
deleted file mode 100644
index e0396c5702..0000000000
--- a/cpp/src_prims/metrics/v_measure.cuh
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * @file v_measure.cuh
- */
-
-#include "homogeneity_score.cuh"
-
-namespace MLCommon {
-
-namespace Metrics {
-
-/**
- * @brief Function to calculate the v-measure between two clusters
- *
- * @param truthClusterArray: the array of truth classes of type T
- * @param predClusterArray: the array of predicted classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
- * @param beta: v_measure parameter
- */
-template <typename T>
-double v_measure(const T* truthClusterArray,
-                 const T* predClusterArray,
-                 int size,
-                 T lowerLabelRange,
-                 T upperLabelRange,
-                 cudaStream_t stream,
-                 double beta = 1.0)
-{
-  double computedHomogeity, computedCompleteness, computedVMeasure;
-
-  computedHomogeity = MLCommon::Metrics::homogeneity_score(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-  computedCompleteness = MLCommon::Metrics::homogeneity_score(
-    predClusterArray, truthClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-
-  if (computedCompleteness + computedHomogeity == 0.0)
-    computedVMeasure = 0.0;
-  else
-    computedVMeasure = ((1 + beta) * computedHomogeity * computedCompleteness /
-                        (beta * computedHomogeity + computedCompleteness));
-
-  return computedVMeasure;
-}
-
-};  // end namespace Metrics
-};  // end namespace MLCommon
diff --git a/cpp/src_prims/selection/columnWiseSort.cuh b/cpp/src_prims/selection/columnWiseSort.cuh
deleted file mode 100644
index 6db4f3bf7f..0000000000
--- a/cpp/src_prims/selection/columnWiseSort.cuh
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cstddef>
-#include <cuda_runtime.h>
-#include <raft/cuda_utils.cuh>
-
-#include <cub/cub.cuh>
-#include <limits>
-#include <map>
-
-#define INST_BLOCK_SORT(keyIn, keyOut, valueInOut, rows, columns, blockSize, elemPT, stream)     \
-  devKeyValSortColumnPerRow<InType, OutType, blockSize, elemPT><<<rows, blockSize, 0, stream>>>( \
-    keyIn, keyOut, valueInOut, rows, columns, std::numeric_limits<InType>::max())
-
-namespace MLCommon {
-namespace Selection {
-
-template <typename InType, int BLOCK_SIZE>
-struct TemplateChecker {
-  enum {
-    IsValid = (std::is_same<InType, short>::value && BLOCK_SIZE <= 1024) ||
-              (std::is_same<InType, int>::value && BLOCK_SIZE <= 1024) ||
-              (std::is_same<InType, float>::value && BLOCK_SIZE <= 1024) ||
-              (std::is_same<InType, double>::value && BLOCK_SIZE <= 512)
-  };
-};
-
-template <typename InType, typename OutType, int BLOCK_SIZE, int ITEMS_PER_THREAD>
-struct SmemPerBlock {
-  typedef cub::BlockLoad<InType, BLOCK_SIZE, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE>
-    BlockLoadTypeKey;
-
-  typedef cub::BlockRadixSort<InType, BLOCK_SIZE, ITEMS_PER_THREAD, OutType> BlockRadixSortType;
-
-  union TempStorage {
-    typename BlockLoadTypeKey::TempStorage keyLoad;
-    typename BlockRadixSortType::TempStorage sort;
-  } tempStorage;
-};
-
-template <typename InType>
-__global__ void devLayoutIdx(InType* in, int n_cols, int totalElements)
-{
-  int idx = threadIdx.x + blockDim.x * blockIdx.x;
-  int n   = n_cols;
-
-  if (idx < totalElements) { in[idx] = idx % n; }
-}
-
-template <typename T>
-__global__ void devOffsetKernel(T* in, T value, int n_times)
-{
-  int idx = threadIdx.x + blockIdx.x * blockDim.x;
-  if (idx < n_times) in[idx] = idx * value;
-}
-
-// block level radix sort - can only sort as much data we can fit within shared memory
-template <
-  typename InType,
-  typename OutType,
-  int BLOCK_SIZE,
-  int ITEMS_PER_THREAD,
-  typename std::enable_if<TemplateChecker<InType, BLOCK_SIZE>::IsValid, InType>::type* = nullptr>
-__global__ void __launch_bounds__(1024, 1) devKeyValSortColumnPerRow(const InType* inputKeys,
-                                                                     InType* outputKeys,
-                                                                     OutType* inputVals,
-                                                                     int n_rows,
-                                                                     int n_cols,
-                                                                     InType MAX_VALUE)
-{
-  typedef cub::BlockLoad<InType, BLOCK_SIZE, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE>
-    BlockLoadTypeKey;
-
-  typedef cub::BlockRadixSort<InType, BLOCK_SIZE, ITEMS_PER_THREAD, OutType> BlockRadixSortType;
-
-  __shared__ SmemPerBlock<InType, OutType, BLOCK_SIZE, ITEMS_PER_THREAD> tmpSmem;
-
-  InType threadKeys[ITEMS_PER_THREAD];
-  OutType threadValues[ITEMS_PER_THREAD];
-
-  int blockOffset = blockIdx.x * n_cols;
-  BlockLoadTypeKey(tmpSmem.tempStorage.keyLoad)
-    .Load(inputKeys + blockOffset, threadKeys, n_cols, MAX_VALUE);
-
-  OutType idxBase = threadIdx.x * ITEMS_PER_THREAD;
-  for (int i = 0; i < ITEMS_PER_THREAD; i++) {
-    OutType eId = idxBase + (OutType)i;
-    if (eId < n_cols)
-      threadValues[i] = eId;
-    else
-      threadValues[i] = MAX_VALUE;
-  }
-
-  __syncthreads();
-
-  BlockRadixSortType(tmpSmem.tempStorage.sort).SortBlockedToStriped(threadKeys, threadValues);
-
-  // storing index values back (not keys)
-  cub::StoreDirectStriped<BLOCK_SIZE>(threadIdx.x, inputVals + blockOffset, threadValues, n_cols);
-
-  if (outputKeys) {
-    cub::StoreDirectStriped<BLOCK_SIZE>(threadIdx.x, outputKeys + blockOffset, threadKeys, n_cols);
-  }
-}
-
-template <
-  typename InType,
-  typename OutType,
-  int BLOCK_SIZE,
-  int ITEMS_PER_THREAD,
-  typename std::enable_if<!(TemplateChecker<InType, BLOCK_SIZE>::IsValid), InType>::type* = nullptr>
-__global__ void devKeyValSortColumnPerRow(const InType* inputKeys,
-                                          InType* outputKeys,
-                                          OutType* inputVals,
-                                          int n_rows,
-                                          int n_cols,
-                                          InType MAX_VALUE)
-{
-  // place holder function
-  // so that compiler unrolls for all template types successfully
-}
-
-// helper function to layout values (index's) for key-value sort
-template <typename OutType>
-cudaError_t layoutIdx(OutType* in, int n_rows, int n_columns, cudaStream_t stream)
-{
-  int totalElements = n_rows * n_columns;
-  dim3 block(256);
-  dim3 grid((totalElements + block.x - 1) / block.x);
-  devLayoutIdx<OutType><<<grid, block, 0, stream>>>(in, n_columns, totalElements);
-  return cudaGetLastError();
-}
-
-// helper function to layout offsets for rows for DeviceSegmentedRadixSort
-template <typename T>
-cudaError_t layoutSortOffset(T* in, T value, int n_times, cudaStream_t stream)
-{
-  dim3 block(128);
-  dim3 grid((n_times + block.x - 1) / block.x);
-  devOffsetKernel<T><<<grid, block, 0, stream>>>(in, value, n_times);
-  return cudaGetLastError();
-}
-
-/**
- * @brief sort columns within each row of row-major input matrix and return sorted indexes
- * modelled as key-value sort with key being input matrix and value being index of values
- * @param in: input matrix
- * @param out: output value(index) matrix
- * @param n_rows: number rows of input matrix
- * @param n_cols: number columns of input matrix
- * @param bAllocWorkspace: check returned value, if true allocate workspace passed in workspaceSize
- * @param workspacePtr: pointer to workspace memory
- * @param workspaceSize: Size of workspace to be allocated
- * @param stream: cuda stream to execute prim on
- * @param sortedKeys: Optional, output matrix for sorted keys (input)
- */
-template <typename InType, typename OutType>
-void sortColumnsPerRow(const InType* in,
-                       OutType* out,
-                       int n_rows,
-                       int n_columns,
-                       bool& bAllocWorkspace,
-                       void* workspacePtr,
-                       size_t& workspaceSize,
-                       cudaStream_t stream,
-                       InType* sortedKeys = nullptr)
-{
-  // assume non-square row-major matrices
-  // current use-case: KNN, trustworthiness scores
-  // output : either sorted indices or sorted indices and input values
-  // future : this prim can be modified to be more generic and serve as a way to sort column entries
-  // per row
-  //          i.e. another output format: sorted values only
-
-  int totalElements          = n_rows * n_columns;
-  size_t perElementSmemUsage = sizeof(InType) + sizeof(OutType);
-  size_t memAlignWidth       = 256;
-
-  // @ToDo: Figure out dynamic shared memory for block sort kernel - better for volta and beyond
-  // int currDevice = 0, smemLimit = 0;
-  // RAFT_CUDA_TRY(cudaGetDevice(&currDevice));
-  // RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemLimit, cudaDevAttrMaxSharedMemoryPerBlock,
-  // currDevice)); size_t maxElementsForBlockSort = smemLimit / perElementSmemUsage;
-
-  // for 48KB smem/block, can fit in 6144 4byte key-value pair
-  // assuming key-value sort for now - smem computation will change for value only sort
-  // dtype being size of key-value pair
-  std::map<size_t, int> dtypeToColumnMap = {{4, 12288},   // short + short
-                                            {8, 12288},   // float/int + int/float
-                                            {12, 6144},   // double + int/float
-                                            {16, 6144}};  // double + double
-
-  if (dtypeToColumnMap.count(perElementSmemUsage) != 0 &&
-      n_columns <= dtypeToColumnMap[perElementSmemUsage]) {
-    // more elements per thread --> more register pressure
-    // 512(blockSize) * 8 elements per thread = 71 register / thread
-
-    // instantiate some kernel combinations
-    if (n_columns <= 512)
-      INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 128, 4, stream);
-    else if (n_columns > 512 && n_columns <= 1024)
-      INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 128, 8, stream);
-    else if (n_columns > 1024 && n_columns <= 3072)
-      INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 512, 6, stream);
-    else if (n_columns > 3072 && n_columns <= 4096)
-      INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 512, 8, stream);
-    else if (n_columns > 4096 && n_columns <= 6144)
-      INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 512, 12, stream);
-    else
-      INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 1024, 12, stream);
-  } else if (n_columns <= (1 << 18) && n_rows > 1) {
-    // device Segmented radix sort
-    // 2^18 column cap to restrict size of workspace ~512 MB
-    // will give better perf than below deviceWide Sort for even larger dims
-    int numSegments = n_rows + 1;
-
-    // need auxillary storage: cub sorting + keys (if user not passing) +
-    // staging for values out + segment partition
-    if (workspaceSize == 0 || !workspacePtr) {
-      OutType* tmpValIn    = nullptr;
-      int* tmpOffsetBuffer = nullptr;
-
-      // first call is to get size of workspace
-      RAFT_CUDA_TRY(cub::DeviceSegmentedRadixSort::SortPairs(workspacePtr,
-                                                             workspaceSize,
-                                                             in,
-                                                             sortedKeys,
-                                                             tmpValIn,
-                                                             out,
-                                                             totalElements,
-                                                             numSegments,
-                                                             tmpOffsetBuffer,
-                                                             tmpOffsetBuffer + 1));
-      bAllocWorkspace = true;
-      // more staging space for temp output of keys
-      if (!sortedKeys)
-        workspaceSize += raft::alignTo(sizeof(InType) * (size_t)totalElements, memAlignWidth);
-
-      // value in KV pair need to be passed in, out buffer is separate
-      workspaceSize += raft::alignTo(sizeof(OutType) * (size_t)totalElements, memAlignWidth);
-
-      // for segment offsets
-      workspaceSize += raft::alignTo(sizeof(int) * (size_t)numSegments, memAlignWidth);
-    } else {
-      size_t workspaceOffset = 0;
-
-      if (!sortedKeys) {
-        sortedKeys      = reinterpret_cast<InType*>(workspacePtr);
-        workspaceOffset = raft::alignTo(sizeof(InType) * (size_t)totalElements, memAlignWidth);
-        workspacePtr    = (void*)((size_t)workspacePtr + workspaceOffset);
-      }
-
-      OutType* dValuesIn = reinterpret_cast<OutType*>(workspacePtr);
-      workspaceOffset    = raft::alignTo(sizeof(OutType) * (size_t)totalElements, memAlignWidth);
-      workspacePtr       = (void*)((size_t)workspacePtr + workspaceOffset);
-
-      int* dSegmentOffsets = reinterpret_cast<int*>(workspacePtr);
-      workspaceOffset      = raft::alignTo(sizeof(int) * (size_t)numSegments, memAlignWidth);
-      workspacePtr         = (void*)((size_t)workspacePtr + workspaceOffset);
-
-      // layout idx
-      RAFT_CUDA_TRY(layoutIdx(dValuesIn, n_rows, n_columns, stream));
-
-      // layout segment lengths - spread out column length
-      RAFT_CUDA_TRY(layoutSortOffset(dSegmentOffsets, n_columns, numSegments, stream));
-
-      RAFT_CUDA_TRY(cub::DeviceSegmentedRadixSort::SortPairs(workspacePtr,
-                                                             workspaceSize,
-                                                             in,
-                                                             sortedKeys,
-                                                             dValuesIn,
-                                                             out,
-                                                             totalElements,
-                                                             numSegments,
-                                                             dSegmentOffsets,
-                                                             dSegmentOffsets + 1,
-                                                             0,
-                                                             sizeof(InType) * 8,
-                                                             stream));
-    }
-  } else {
-    // batched per row device wide sort
-    if (workspaceSize == 0 || !workspacePtr) {
-      OutType* tmpValIn = nullptr;
-
-      // first call is to get size of workspace
-      RAFT_CUDA_TRY(cub::DeviceRadixSort::SortPairs(
-        workspacePtr, workspaceSize, in, sortedKeys, tmpValIn, out, n_columns));
-      bAllocWorkspace = true;
-
-      if (!sortedKeys)
-        workspaceSize += raft::alignTo(sizeof(InType) * (size_t)n_columns, memAlignWidth);
-
-      workspaceSize += raft::alignTo(sizeof(OutType) * (size_t)n_columns, memAlignWidth);
-    } else {
-      size_t workspaceOffset   = 0;
-      bool userKeyOutputBuffer = true;
-
-      if (!sortedKeys) {
-        userKeyOutputBuffer = false;
-        sortedKeys          = reinterpret_cast<InType*>(workspacePtr);
-        workspaceOffset     = raft::alignTo(sizeof(InType) * (size_t)n_columns, memAlignWidth);
-        workspacePtr        = (void*)((size_t)workspacePtr + workspaceOffset);
-      }
-
-      OutType* dValuesIn = reinterpret_cast<OutType*>(workspacePtr);
-      workspaceOffset    = raft::alignTo(sizeof(OutType) * (size_t)n_columns, memAlignWidth);
-      workspacePtr       = (void*)((size_t)workspacePtr + workspaceOffset);
-
-      // layout idx
-      RAFT_CUDA_TRY(layoutIdx(dValuesIn, 1, n_columns, stream));
-
-      for (int i = 0; i < n_rows; i++) {
-        InType* rowIn =
-          reinterpret_cast<InType*>((size_t)in + (i * sizeof(InType) * (size_t)n_columns));
-        OutType* rowOut =
-          reinterpret_cast<OutType*>((size_t)out + (i * sizeof(OutType) * (size_t)n_columns));
-
-        RAFT_CUDA_TRY(cub::DeviceRadixSort::SortPairs(
-          workspacePtr, workspaceSize, rowIn, sortedKeys, dValuesIn, rowOut, n_columns));
-
-        if (userKeyOutputBuffer)
-          sortedKeys =
-            reinterpret_cast<InType*>((size_t)sortedKeys + sizeof(InType) * (size_t)n_columns);
-      }
-    }
-  }
-}
-};  // end namespace Selection
-};  // end namespace MLCommon
diff --git a/cpp/src_prims/selection/haversine_knn.cuh b/cpp/src_prims/selection/haversine_knn.cuh
deleted file mode 100644
index c531c3577b..0000000000
--- a/cpp/src_prims/selection/haversine_knn.cuh
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/cuda_utils.cuh>
-
-#include <faiss/gpu/utils/Limits.cuh>
-#include <faiss/gpu/utils/Select.cuh>
-
-namespace raft {
-namespace selection {
-
-template <typename value_t>
-DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2)
-{
-  value_t sin_0 = sin(0.5 * (x1 - y1));
-  value_t sin_1 = sin(0.5 * (x2 - y2));
-  value_t rdist = sin_0 * sin_0 + cos(x1) * cos(y1) * sin_1 * sin_1;
-
-  return 2 * asin(sqrt(rdist));
-}
-
-/**
- * @tparam value_idx data type of indices
- * @tparam value_t data type of values and distances
- * @tparam warp_q
- * @tparam thread_q
- * @tparam tpb
- * @param[out] out_inds output indices
- * @param[out] out_dists output distances
- * @param[in] index index array
- * @param[in] query query array
- * @param[in] n_index_rows number of rows in index array
- * @param[in] k number of closest neighbors to return
- */
-template <typename value_idx, typename value_t, int warp_q = 1024, int thread_q = 8, int tpb = 128>
-__global__ void haversine_knn_kernel(value_idx* out_inds,
-                                     value_t* out_dists,
-                                     const value_t* index,
-                                     const value_t* query,
-                                     size_t n_index_rows,
-                                     int k)
-{
-  constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize;
-
-  __shared__ value_t smemK[kNumWarps * warp_q];
-  __shared__ value_idx smemV[kNumWarps * warp_q];
-
-  faiss::gpu::
-    BlockSelect<value_t, value_idx, false, faiss::gpu::Comparator<value_t>, warp_q, thread_q, tpb>
-      heap(faiss::gpu::Limits<value_t>::getMax(), -1, smemK, smemV, k);
-
-  // Grid is exactly sized to rows available
-  int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize);
-
-  const value_t* query_ptr = query + (blockIdx.x * 2);
-  value_t x1               = query_ptr[0];
-  value_t x2               = query_ptr[1];
-
-  int i = threadIdx.x;
-
-  for (; i < limit; i += tpb) {
-    const value_t* idx_ptr = index + (i * 2);
-    value_t y1             = idx_ptr[0];
-    value_t y2             = idx_ptr[1];
-
-    value_t dist = compute_haversine(x1, y1, x2, y2);
-
-    heap.add(dist, i);
-  }
-
-  // Handle last remainder fraction of a warp of elements
-  if (i < n_index_rows) {
-    const value_t* idx_ptr = index + (i * 2);
-    value_t y1             = idx_ptr[0];
-    value_t y2             = idx_ptr[1];
-
-    value_t dist = compute_haversine(x1, y1, x2, y2);
-
-    heap.addThreadQ(dist, i);
-  }
-
-  heap.reduce();
-
-  for (int i = threadIdx.x; i < k; i += tpb) {
-    out_dists[blockIdx.x * k + i] = smemK[i];
-    out_inds[blockIdx.x * k + i]  = smemV[i];
-  }
-}
-
-/**
- * Conmpute the k-nearest neighbors using the Haversine
- * (great circle arc) distance. Input is assumed to have
- * 2 dimensions (latitude, longitude) in radians.
-
- * @tparam value_idx
- * @tparam value_t
- * @param[out] out_inds output indices array on device (size n_query_rows * k)
- * @param[out] out_dists output dists array on device (size n_query_rows * k)
- * @param[in] index input index array on device (size n_index_rows * 2)
- * @param[in] query input query array on device (size n_query_rows * 2)
- * @param[in] n_index_rows number of rows in index array
- * @param[in] n_query_rows number of rows in query array
- * @param[in] k number of closest neighbors to return
- * @param[in] stream stream to order kernel launch
- */
-template <typename value_idx, typename value_t>
-void haversine_knn(value_idx* out_inds,
-                   value_t* out_dists,
-                   const value_t* index,
-                   const value_t* query,
-                   size_t n_index_rows,
-                   size_t n_query_rows,
-                   int k,
-                   cudaStream_t stream)
-{
-  haversine_knn_kernel<<<n_query_rows, 128, 0, stream>>>(
-    out_inds, out_dists, index, query, n_index_rows, k);
-}
-
-};  // namespace selection
-};  // namespace raft
diff --git a/cpp/src_prims/selection/knn.cuh b/cpp/src_prims/selection/knn.cuh
deleted file mode 100644
index bdba95d082..0000000000
--- a/cpp/src_prims/selection/knn.cuh
+++ /dev/null
@@ -1,348 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "haversine_knn.cuh"
-#include "processing.cuh"
-
-#include <label/classlabels.cuh>
-
-#include <cuml/neighbors/knn.hpp>
-
-#include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
-#include <raft/distance/distance.hpp>
-#include <raft/distance/distance_type.hpp>
-#include <raft/mr/device/allocator.hpp>
-
-#include <faiss/gpu/GpuDistance.h>
-#include <faiss/gpu/GpuIndexFlat.h>
-#include <faiss/gpu/GpuIndexIVFFlat.h>
-#include <faiss/gpu/GpuIndexIVFPQ.h>
-#include <faiss/gpu/GpuIndexIVFScalarQuantizer.h>
-#include <faiss/gpu/GpuResources.h>
-#include <faiss/gpu/StandardGpuResources.h>
-#include <faiss/gpu/utils/Limits.cuh>
-#include <faiss/gpu/utils/Select.cuh>
-#include <faiss/gpu/utils/Tensor.cuh>
-#include <faiss/utils/Heap.h>
-
-#include <thrust/device_vector.h>
-#include <thrust/iterator/transform_iterator.h>
-
-#include <cstddef>
-#include <iostream>
-#include <set>
-
-namespace MLCommon {
-namespace Selection {
-
-template <bool precomp_lbls, typename T>
-inline __device__ T get_lbls(const T* labels, const int64_t* knn_indices, int64_t idx)
-{
-  if (precomp_lbls) {
-    return labels[idx];
-  } else {
-    int64_t neighbor_idx = knn_indices[idx];
-    return labels[neighbor_idx];
-  }
-}
-
-template <typename OutType = float, bool precomp_lbls = false>
-__global__ void class_probs_kernel(OutType* out,
-                                   const int64_t* knn_indices,
-                                   const int* labels,
-                                   int n_uniq_labels,
-                                   std::size_t n_samples,
-                                   int n_neighbors)
-{
-  int row = (blockIdx.x * blockDim.x) + threadIdx.x;
-  int i   = row * n_neighbors;
-
-  float n_neigh_inv = 1.0f / n_neighbors;
-
-  if (row >= n_samples) return;
-
-  for (int j = 0; j < n_neighbors; j++) {
-    int out_label = get_lbls<precomp_lbls>(labels, knn_indices, i + j);
-    int out_idx   = row * n_uniq_labels + out_label;
-    out[out_idx] += n_neigh_inv;
-  }
-}
-
-template <typename OutType = int>
-__global__ void class_vote_kernel(OutType* out,
-                                  const float* class_proba,
-                                  int* unique_labels,
-                                  int n_uniq_labels,
-                                  std::size_t n_samples,
-                                  int n_outputs,
-                                  int output_offset,
-                                  bool use_shared_mem)
-{
-  int row = (blockIdx.x * blockDim.x) + threadIdx.x;
-  int i   = row * n_uniq_labels;
-
-  extern __shared__ int label_cache[];
-  if (use_shared_mem) {
-    for (int j = threadIdx.x; j < n_uniq_labels; j += blockDim.x) {
-      label_cache[j] = unique_labels[j];
-    }
-
-    __syncthreads();
-  }
-
-  if (row >= n_samples) return;
-  float cur_max = -1.0;
-  int cur_label = -1;
-  for (int j = 0; j < n_uniq_labels; j++) {
-    float cur_proba = class_proba[i + j];
-    if (cur_proba > cur_max) {
-      cur_max   = cur_proba;
-      cur_label = j;
-    }
-  }
-
-  int val = use_shared_mem ? label_cache[cur_label] : unique_labels[cur_label];
-
-  out[row * n_outputs + output_offset] = val;
-}
-
-template <typename LabelType, bool precomp_lbls = false>
-__global__ void regress_avg_kernel(LabelType* out,
-                                   const int64_t* knn_indices,
-                                   const LabelType* labels,
-                                   std::size_t n_samples,
-                                   int n_neighbors,
-                                   int n_outputs,
-                                   int output_offset)
-{
-  int row = (blockIdx.x * blockDim.x) + threadIdx.x;
-  int i   = row * n_neighbors;
-
-  if (row >= n_samples) return;
-
-  LabelType pred = 0;
-  for (int j = 0; j < n_neighbors; j++) {
-    pred += get_lbls<precomp_lbls>(labels, knn_indices, i + j);
-  }
-
-  out[row * n_outputs + output_offset] = pred / (LabelType)n_neighbors;
-}
-
-/**
- * A naive knn classifier to predict probabilities
- * @tparam TPB_X number of threads per block to use. each thread
- *               will process a single row of knn_indices
- * @tparam precomp_lbls is set to true for the reduction step of MNMG KNN Classifier. In this case,
- *         the knn_indices array is not used as the y arrays already store the labels for each row.
- *         This makes it possible to compute the reduction step without holding all the data on a
- * single machine.
- * @param[out] out vector of output class probabilities of the same size as y.
- *            each element should be of size size (n_samples * n_classes[i])
- * @param[in] knn_indices the index array resulting from a knn search
- * @param[in] y vector of label arrays. for multulabel classification,
- *          each output in the vector is a different array of labels
- *          corresponding to the i'th output.
- * @param[in] n_index_rows number of vertices in index (eg. size of each y array)
- * @param[in] n_query_rows number of rows in knn_indices
- * @param[in] k number of neighbors in knn_indices
- * @param[in] uniq_labels vector of the sorted unique labels for each array in y
- * @param[in] n_unique vector of sizes for each array in uniq_labels
- * @param[in] user_stream main stream to use for queuing isolated CUDA events
- * @param[in] int_streams internal streams to use for parallelizing independent CUDA events.
- * @param[in] n_int_streams number of elements in int_streams array. If this is less than 1,
- *        the user_stream is used.
- */
-template <int TPB_X = 32, bool precomp_lbls = false>
-void class_probs(const raft::handle_t& handle,
-                 std::vector<float*>& out,
-                 const int64_t* knn_indices,
-                 std::vector<int*>& y,
-                 std::size_t n_index_rows,
-                 std::size_t n_query_rows,
-                 int k,
-                 std::vector<int*>& uniq_labels,
-                 std::vector<int>& n_unique)
-{
-  for (std::size_t i = 0; i < y.size(); i++) {
-    cudaStream_t stream = handle.get_next_usable_stream();
-
-    int n_unique_labels = n_unique[i];
-    size_t cur_size     = n_query_rows * n_unique_labels;
-
-    RAFT_CUDA_TRY(cudaMemsetAsync(out[i], 0, cur_size * sizeof(float), stream));
-
-    dim3 grid(raft::ceildiv(n_query_rows, static_cast<std::size_t>(TPB_X)), 1, 1);
-    dim3 blk(TPB_X, 1, 1);
-
-    /**
-     * Build array of class probability arrays from
-     * knn_indices and labels
-     */
-    rmm::device_uvector<int> y_normalized(n_index_rows + n_unique_labels, stream);
-
-    /*
-     * Appending the array of unique labels to the original labels array
-     * to prevent make_monotonic function from producing misleading results
-     * due to the absence of some of the unique labels in the labels array
-     */
-    rmm::device_uvector<int> y_tmp(n_index_rows + n_unique_labels, stream);
-    raft::update_device(y_tmp.data(), y[i], n_index_rows, stream);
-    raft::update_device(y_tmp.data() + n_index_rows, uniq_labels[i], n_unique_labels, stream);
-
-    MLCommon::Label::make_monotonic(y_normalized.data(), y_tmp.data(), y_tmp.size(), stream);
-    raft::linalg::unaryOp<int>(
-      y_normalized.data(),
-      y_normalized.data(),
-      n_index_rows,
-      [] __device__(int input) { return input - 1; },
-      stream);
-    class_probs_kernel<float, precomp_lbls><<<grid, blk, 0, stream>>>(
-      out[i], knn_indices, y_normalized.data(), n_unique_labels, n_query_rows, k);
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
-}
-
-/**
- * KNN classifier using voting based on the statistical mode of classes.
- * In the event of a tie, the class with the lowest index in the sorted
- * array of unique monotonically increasing labels will be used.
- *
- * @tparam TPB_X the number of threads per block to use
- * @tparam precomp_lbls is set to true for the reduction step of MNMG KNN Classifier. In this case,
- * the knn_indices array is not used as the y arrays already store the labels for each row.
- * This makes it possible to compute the reduction step without holding all the data on a single
- * machine.
- * @param[out] out output array of size (n_samples * y.size())
- * @param[in] knn_indices index array from knn search
- * @param[in] y vector of label arrays. for multilabel classification, each
- *          element in the vector is a different "output" array of labels corresponding
- *          to the i'th output.
- * @param[in] n_index_rows number of vertices in index (eg. size of each y array)
- * @param[in] n_query_rows number of rows in knn_indices
- * @param[in] k number of neighbors in knn_indices
- * @param[in] uniq_labels vector of the sorted unique labels for each array in y
- * @param[in] n_unique vector of sizes for each array in uniq_labels
- * @param[in] user_stream main stream to use for queuing isolated CUDA events
- * @param[in] int_streams internal streams to use for parallelizing independent CUDA events.
- * @param[in] n_int_streams number of elements in int_streams array. If this is less than 1,
- *        the user_stream is used.
- */
-template <int TPB_X = 32, bool precomp_lbls = false>
-void knn_classify(const raft::handle_t& handle,
-                  int* out,
-                  const int64_t* knn_indices,
-                  std::vector<int*>& y,
-                  std::size_t n_index_rows,
-                  std::size_t n_query_rows,
-                  int k,
-                  std::vector<int*>& uniq_labels,
-                  std::vector<int>& n_unique)
-{
-  std::vector<float*> probs;
-  std::vector<rmm::device_uvector<float>> tmp_probs;
-
-  // allocate temporary memory
-  for (std::size_t i = 0; i < n_unique.size(); i++) {
-    int size = n_unique[i];
-
-    cudaStream_t stream = handle.get_next_usable_stream(i);
-
-    tmp_probs.emplace_back(n_query_rows * size, stream);
-    probs.push_back(tmp_probs.back().data());
-  }
-
-  /**
-   * Compute class probabilities
-   *
-   * Note: Since class_probs will use the same round robin strategy for distributing
-   * work to the streams, we don't need to explicitly synchronize the streams here.
-   */
-  class_probs<32, precomp_lbls>(
-    handle, probs, knn_indices, y, n_index_rows, n_query_rows, k, uniq_labels, n_unique);
-
-  dim3 grid(raft::ceildiv(n_query_rows, static_cast<std::size_t>(TPB_X)), 1, 1);
-  dim3 blk(TPB_X, 1, 1);
-
-  for (std::size_t i = 0; i < y.size(); i++) {
-    cudaStream_t stream = handle.get_next_usable_stream(i);
-
-    int n_unique_labels = n_unique[i];
-
-    /**
-     * Choose max probability
-     */
-    // Use shared memory for label lookups if the number of classes is small enough
-    int smem            = sizeof(int) * n_unique_labels;
-    bool use_shared_mem = smem < raft::getSharedMemPerBlock();
-
-    class_vote_kernel<<<grid, blk, use_shared_mem ? smem : 0, stream>>>(
-      out, probs[i], uniq_labels[i], n_unique_labels, n_query_rows, y.size(), i, use_shared_mem);
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
-}
-
-/**
- * KNN regression using voting based on the mean of the labels for the
- * nearest neighbors.
- * @tparam ValType data type of the labels
- * @tparam TPB_X the number of threads per block to use
- * @tparam precomp_lbls is set to true for the reduction step of MNMG KNN Regressor. In this case,
- * the knn_indices array is not used as the y arrays already store the output for each row.
- * This makes it possible to compute the reduction step without holding all the data on a single
- * machine.
- * @param[out] out output array of size (n_samples * y.size())
- * @param[in] knn_indices index array from knn search
- * @param[in] y vector of label arrays. for multilabel classification, each
- *          element in the vector is a different "output" array of labels corresponding
- *          to the i'th output.
- * @param[in] n_index_rows number of vertices in index (eg. size of each y array)
- * @param[in] n_query_rows number of rows in knn_indices
- * @param[in] k number of neighbors in knn_indices
- * @param[in] user_stream main stream to use for queuing isolated CUDA events
- * @param[in] int_streams internal streams to use for parallelizing independent CUDA events.
- * @param[in] n_int_streams number of elements in int_streams array. If this is less than 1,
- *        the user_stream is used.
- */
-
-template <typename ValType, int TPB_X = 32, bool precomp_lbls = false>
-void knn_regress(const raft::handle_t& handle,
-                 ValType* out,
-                 const int64_t* knn_indices,
-                 const std::vector<ValType*>& y,
-                 size_t n_index_rows,
-                 size_t n_query_rows,
-                 int k)
-{
-  /**
-   * Vote average regression value
-   */
-  for (std::size_t i = 0; i < y.size(); i++) {
-    cudaStream_t stream = handle.get_next_usable_stream();
-
-    regress_avg_kernel<ValType, precomp_lbls>
-      <<<raft::ceildiv(n_query_rows, static_cast<std::size_t>(TPB_X)), TPB_X, 0, stream>>>(
-        out, knn_indices, y[i], n_query_rows, k, y.size(), i);
-
-    handle.sync_stream(stream);
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
-}
-
-};  // namespace Selection
-};  // namespace MLCommon
diff --git a/cpp/src_prims/selection/processing.cuh b/cpp/src_prims/selection/processing.cuh
deleted file mode 100644
index b559efda45..0000000000
--- a/cpp/src_prims/selection/processing.cuh
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cuml/neighbors/knn.hpp>
-
-#include <raft/linalg/matrix_vector_op.hpp>
-#include <raft/linalg/norm.hpp>
-#include <raft/linalg/unary_op.hpp>
-
-#include <raft/stats/mean.hpp>
-#include <raft/stats/mean_center.hpp>
-
-#include <rmm/device_uvector.hpp>
-
-namespace MLCommon {
-namespace Selection {
-
-/**
- * @brief A virtual class defining pre- and post-processing
- * for metrics. This class will temporarily modify its given
- * state in `preprocess()` and undo those modifications in
- * `postprocess()`
- */
-
-template <typename math_t>
-class MetricProcessor {
- public:
-  virtual void preprocess(math_t* data) {}
-
-  virtual void revert(math_t* data) {}
-
-  virtual void postprocess(math_t* data) {}
-
-  virtual ~MetricProcessor() = default;
-};
-
-template <typename math_t>
-class CosineMetricProcessor : public MetricProcessor<math_t> {
- protected:
-  int k_;
-  bool row_major_;
-  size_t n_rows_;
-  size_t n_cols_;
-  cudaStream_t stream_;
-  rmm::device_uvector<math_t> colsums_;
-
- public:
-  CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream)
-    : stream_(stream),
-      colsums_(n_rows, stream),
-      n_cols_(n_cols),
-      n_rows_(n_rows),
-      row_major_(row_major),
-      k_(k)
-  {
-  }
-
-  void preprocess(math_t* data)
-  {
-    raft::linalg::rowNorm(colsums_.data(),
-                          data,
-                          n_cols_,
-                          n_rows_,
-                          raft::linalg::NormType::L2Norm,
-                          row_major_,
-                          stream_,
-                          [] __device__(math_t in) { return sqrtf(in); });
-
-    raft::linalg::matrixVectorOp(
-      data,
-      data,
-      colsums_.data(),
-      n_cols_,
-      n_rows_,
-      row_major_,
-      false,
-      [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; },
-      stream_);
-  }
-
-  void revert(math_t* data)
-  {
-    raft::linalg::matrixVectorOp(
-      data,
-      data,
-      colsums_.data(),
-      n_cols_,
-      n_rows_,
-      row_major_,
-      false,
-      [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; },
-      stream_);
-  }
-
-  void postprocess(math_t* data)
-  {
-    raft::linalg::unaryOp(
-      data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_);
-  }
-
-  ~CosineMetricProcessor() = default;
-};
-
-template <typename math_t>
-class CorrelationMetricProcessor : public CosineMetricProcessor<math_t> {
-  using cosine = CosineMetricProcessor<math_t>;
-
- public:
-  CorrelationMetricProcessor(
-    size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream)
-    : CosineMetricProcessor<math_t>(n_rows, n_cols, k, row_major, stream), means_(n_rows, stream)
-  {
-  }
-
-  void preprocess(math_t* data)
-  {
-    math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_;
-
-    raft::linalg::reduce(means_.data(),
-                         data,
-                         cosine::n_cols_,
-                         cosine::n_rows_,
-                         (math_t)0.0,
-                         cosine::row_major_,
-                         true,
-                         cosine::stream_);
-
-    raft::linalg::unaryOp(
-      means_.data(),
-      means_.data(),
-      cosine::n_rows_,
-      [=] __device__(math_t in) { return in * normalizer_const; },
-      cosine::stream_);
-
-    raft::stats::meanCenter(data,
-                            data,
-                            means_.data(),
-                            cosine::n_cols_,
-                            cosine::n_rows_,
-                            cosine::row_major_,
-                            false,
-                            cosine::stream_);
-
-    CosineMetricProcessor<math_t>::preprocess(data);
-  }
-
-  void revert(math_t* data)
-  {
-    CosineMetricProcessor<math_t>::revert(data);
-
-    raft::stats::meanAdd(data,
-                         data,
-                         means_.data(),
-                         cosine::n_cols_,
-                         cosine::n_rows_,
-                         cosine::row_major_,
-                         false,
-                         cosine::stream_);
-  }
-
-  void postprocess(math_t* data) { CosineMetricProcessor<math_t>::postprocess(data); }
-
-  ~CorrelationMetricProcessor() = default;
-
-  rmm::device_uvector<math_t> means_;
-};
-
-template <typename math_t>
-class DefaultMetricProcessor : public MetricProcessor<math_t> {
- public:
-  void preprocess(math_t* data) {}
-
-  void revert(math_t* data) {}
-
-  void postprocess(math_t* data) {}
-
-  ~DefaultMetricProcessor() = default;
-};
-
-template <typename math_t>
-inline std::unique_ptr<MetricProcessor<math_t>> create_processor(
-  raft::distance::DistanceType metric,
-  int n,
-  int D,
-  int k,
-  bool rowMajorQuery,
-  cudaStream_t userStream)
-{
-  MetricProcessor<math_t>* mp = nullptr;
-
-  switch (metric) {
-    case raft::distance::DistanceType::CosineExpanded:
-      mp = new CosineMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream);
-      break;
-
-    case raft::distance::DistanceType::CorrelationExpanded:
-      mp = new CorrelationMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream);
-      break;
-    default: mp = new DefaultMetricProcessor<math_t>();
-  }
-
-  return std::unique_ptr<MetricProcessor<math_t>>(mp);
-}
-
-// Currently only being used by floats
-template class MetricProcessor<float>;
-template class CosineMetricProcessor<float>;
-template class CorrelationMetricProcessor<float>;
-template class DefaultMetricProcessor<float>;
-
-};  // namespace Selection
-};  // namespace MLCommon
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index d980468e7b..05f31c79dc 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -168,31 +168,24 @@ if(BUILD_PRIMS_TESTS)
   # (please keep the filenames in alphabetical order)
   add_executable(${PRIMS_TEST_TARGET}
     prims/add_sub_dev_scalar.cu
-    prims/adjusted_rand_index.cu
-    prims/batched/csr.cu
+    prims/batched/csr.cu
     prims/batched/gemv.cu
     prims/batched/information_criterion.cu
     prims/batched/make_symm.cu
     prims/batched/matrix.cu
     prims/cache.cu
     prims/columnSort.cu
-    prims/completeness_score.cu
-    prims/contingencyMatrix.cu
-    prims/decoupled_lookback.cu
+    prims/decoupled_lookback.cu
     prims/device_utils.cu
-    prims/dispersion.cu
-    prims/eltwise2d.cu
-    prims/entropy.cu
-    prims/fast_int_div.cu
+    prims/eltwise2d.cu
+    prims/fast_int_div.cu
     prims/fillna.cu
     prims/gather.cu
     prims/gram.cu
     prims/grid_sync.cu
     prims/hinge.cu
-    prims/homogeneity_score.cu
-    prims/jones_transform.cu
-    prims/kl_divergence.cu
-    prims/knn_classify.cu
+    prims/jones_transform.cu
+    prims/knn_classify.cu
     prims/knn_regression.cu
     prims/kselection.cu
     prims/label.cu
@@ -202,15 +195,9 @@ if(BUILD_PRIMS_TESTS)
     prims/logisticReg.cu
     prims/make_arima.cu
     prims/merge_labels.cu
-    prims/mutual_info_score.cu
-    prims/penalty.cu
-    prims/rand_index.cu
-    prims/reverse.cu
-    prims/score.cu
-    prims/sigmoid.cu
-    prims/silhouette_score.cu
-    prims/trustworthiness.cu
-    prims/v_measure.cu
+    prims/penalty.cu
+    prims/reverse.cu
+    prims/sigmoid.cu
            )
 
   target_compile_options(${PRIMS_TEST_TARGET}
diff --git a/cpp/test/prims/adjusted_rand_index.cu b/cpp/test/prims/adjusted_rand_index.cu
deleted file mode 100644
index 4958ce5d00..0000000000
--- a/cpp/test/prims/adjusted_rand_index.cu
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "test_utils.h"
-#include <algorithm>
-#include <gtest/gtest.h>
-#include <iostream>
-#include <metrics/adjusted_rand_index.cuh>
-#include <metrics/contingencyMatrix.cuh>
-#include <raft/cudart_utils.h>
-#include <random>
-
-namespace MLCommon {
-namespace Metrics {
-
-struct adjustedRandIndexParam {
-  int nElements;
-  int lowerLabelRange;
-  int upperLabelRange;
-  bool sameArrays;
-  double tolerance;
-  // if this is true, then it is assumed that `sameArrays` is also true
-  // further it also assumes `lowerLabelRange` and `upperLabelRange` are 0
-  bool testZeroArray;
-};
-
-template <typename T, typename MathT = int>
-class adjustedRandIndexTest : public ::testing::TestWithParam<adjustedRandIndexParam> {
- protected:
-  adjustedRandIndexTest() : firstClusterArray(0, stream), secondClusterArray(0, stream) {}
-
-  void SetUp() override
-  {
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-    params    = ::testing::TestWithParam<adjustedRandIndexParam>::GetParam();
-    nElements = params.nElements;
-
-    firstClusterArray.resize(nElements, stream);
-    secondClusterArray.resize(nElements, stream);
-    RAFT_CUDA_TRY(
-      cudaMemsetAsync(firstClusterArray.data(), 0, firstClusterArray.size() * sizeof(T), stream));
-    RAFT_CUDA_TRY(
-      cudaMemsetAsync(secondClusterArray.data(), 0, secondClusterArray.size() * sizeof(T), stream));
-
-    if (!params.testZeroArray) {
-      SetUpDifferentArrays();
-    } else {
-      SetupZeroArray();
-    }
-    // allocating and initializing memory to the GPU
-    computed_adjusted_rand_index = compute_adjusted_rand_index<T, MathT>(
-      firstClusterArray.data(), secondClusterArray.data(), nElements, stream);
-  }
-
-  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
-
-  void SetUpDifferentArrays()
-  {
-    lowerLabelRange = params.lowerLabelRange;
-    upperLabelRange = params.upperLabelRange;
-    std::vector<int> arr1(nElements, 0);
-    std::vector<int> arr2(nElements, 0);
-    std::random_device rd;
-    std::default_random_engine dre(rd());
-    std::uniform_int_distribution<int> intGenerator(lowerLabelRange, upperLabelRange);
-    std::generate(arr1.begin(), arr1.end(), [&]() { return intGenerator(dre); });
-    if (params.sameArrays) {
-      arr2 = arr1;
-    } else {
-      std::generate(arr2.begin(), arr2.end(), [&]() { return intGenerator(dre); });
-    }
-    // calculating golden output
-    int numUniqueClasses = upperLabelRange - lowerLabelRange + 1;
-    size_t sizeOfMat     = numUniqueClasses * numUniqueClasses * sizeof(int);
-    int* hGoldenOutput   = (int*)malloc(sizeOfMat);
-    memset(hGoldenOutput, 0, sizeOfMat);
-    for (int i = 0; i < nElements; i++) {
-      int row    = arr1[i] - lowerLabelRange;
-      int column = arr2[i] - lowerLabelRange;
-      hGoldenOutput[row * numUniqueClasses + column] += 1;
-    }
-    int sumOfNijCTwo = 0;
-    int* a           = (int*)malloc(numUniqueClasses * sizeof(int));
-    int* b           = (int*)malloc(numUniqueClasses * sizeof(int));
-    memset(a, 0, numUniqueClasses * sizeof(int));
-    memset(b, 0, numUniqueClasses * sizeof(int));
-    int sumOfAiCTwo = 0;
-    int sumOfBiCTwo = 0;
-    // calculating the sum of number of pairwise points in each index
-    // and also the reducing contingency matrix along row and column
-    for (int i = 0; i < numUniqueClasses; ++i) {
-      for (int j = 0; j < numUniqueClasses; ++j) {
-        int Nij = hGoldenOutput[i * numUniqueClasses + j];
-        sumOfNijCTwo += ((Nij) * (Nij - 1)) / 2;
-        a[i] += hGoldenOutput[i * numUniqueClasses + j];
-        b[i] += hGoldenOutput[j * numUniqueClasses + i];
-      }
-    }
-    // claculating the sum of number pairwise points in ever column sum
-    // claculating the sum of number pairwise points in ever row sum
-    for (int i = 0; i < numUniqueClasses; ++i) {
-      sumOfAiCTwo += ((a[i]) * (a[i] - 1)) / 2;
-      sumOfBiCTwo += ((b[i]) * (b[i] - 1)) / 2;
-    }
-    // calculating the ARI
-    double nCTwo         = double(nElements) * double(nElements - 1) / 2.0;
-    double expectedIndex = (double(sumOfBiCTwo) * double(sumOfAiCTwo)) / double(nCTwo);
-    double maxIndex      = (double(sumOfAiCTwo) + double(sumOfBiCTwo)) / 2.0;
-    double index         = (double)sumOfNijCTwo;
-    if (maxIndex - expectedIndex)
-      truth_adjusted_rand_index = (index - expectedIndex) / (maxIndex - expectedIndex);
-    else
-      truth_adjusted_rand_index = 0;
-    raft::update_device(firstClusterArray.data(), &arr1[0], nElements, stream);
-    raft::update_device(secondClusterArray.data(), &arr2[0], nElements, stream);
-  }
-
-  void SetupZeroArray()
-  {
-    lowerLabelRange           = 0;
-    upperLabelRange           = 0;
-    truth_adjusted_rand_index = 1.0;
-  }
-
-  adjustedRandIndexParam params;
-  T lowerLabelRange, upperLabelRange;
-  rmm::device_uvector<T> firstClusterArray;
-  rmm::device_uvector<T> secondClusterArray;
-  int nElements                       = 0;
-  double truth_adjusted_rand_index    = 0;
-  double computed_adjusted_rand_index = 0;
-  cudaStream_t stream                 = 0;
-};
-
-const std::vector<adjustedRandIndexParam> inputs = {
-  {199, 1, 10, false, 0.000001, false},
-  {200, 15, 100, false, 0.000001, false},
-  {100, 1, 20, false, 0.000001, false},
-  {10, 1, 10, false, 0.000001, false},
-  {198, 1, 100, false, 0.000001, false},
-  {300, 3, 99, false, 0.000001, false},
-  {199, 1, 10, true, 0.000001, false},
-  {200, 15, 100, true, 0.000001, false},
-  {100, 1, 20, true, 0.000001, false},
-  // FIXME: disabled temporarily due to flaky test
-  // {10, 1, 10, true, 0.000001, false},
-  {198, 1, 100, true, 0.000001, false},
-  {300, 3, 99, true, 0.000001, false},
-
-  {199, 0, 0, false, 0.000001, true},
-  {200, 0, 0, false, 0.000001, true},
-  {100, 0, 0, false, 0.000001, true},
-  {10, 0, 0, false, 0.000001, true},
-  {198, 0, 0, false, 0.000001, true},
-  {300, 0, 0, false, 0.000001, true},
-  {199, 0, 0, true, 0.000001, true},
-  {200, 0, 0, true, 0.000001, true},
-  {100, 0, 0, true, 0.000001, true},
-  {10, 0, 0, true, 0.000001, true},
-  {198, 0, 0, true, 0.000001, true},
-  {300, 0, 0, true, 0.000001, true},
-};
-
-const std::vector<adjustedRandIndexParam> large_inputs = {
-  {2000000, 1, 1000, false, 0.000001, false},
-  {2000000, 1, 1000, true, 0.000001, false},
-
-  {2000000, 0, 0, false, 0.000001, true},
-  {2000000, 0, 0, true, 0.000001, true},
-};
-
-typedef adjustedRandIndexTest<int, int> ARI_ii;
-TEST_P(ARI_ii, Result)
-{
-  ASSERT_NEAR(computed_adjusted_rand_index, truth_adjusted_rand_index, params.tolerance);
-}
-INSTANTIATE_TEST_CASE_P(adjusted_rand_index, ARI_ii, ::testing::ValuesIn(inputs));
-
-typedef adjustedRandIndexTest<int, unsigned long long> ARI_il;
-TEST_P(ARI_il, Result)
-{
-  ASSERT_NEAR(computed_adjusted_rand_index, truth_adjusted_rand_index, params.tolerance);
-}
-INSTANTIATE_TEST_CASE_P(adjusted_rand_index, ARI_il, ::testing::ValuesIn(inputs));
-INSTANTIATE_TEST_CASE_P(adjusted_rand_index_large, ARI_il, ::testing::ValuesIn(large_inputs));
-
-}  // end namespace Metrics
-}  // end namespace MLCommon
diff --git a/cpp/test/prims/completeness_score.cu b/cpp/test/prims/completeness_score.cu
deleted file mode 100644
index aa32114082..0000000000
--- a/cpp/test/prims/completeness_score.cu
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "test_utils.h"
-#include <algorithm>
-#include <gtest/gtest.h>
-#include <iostream>
-#include <metrics/completeness_score.cuh>
-#include <raft/cudart_utils.h>
-#include <random>
-
-namespace MLCommon {
-namespace Metrics {
-
-// parameter structure definition
-struct completenessParam {
-  int nElements;
-  int lowerLabelRange;
-  int upperLabelRange;
-  bool sameArrays;
-  double tolerance;
-};
-
-// test fixture class
-template <typename T>
-class completenessTest : public ::testing::TestWithParam<completenessParam> {
- protected:
-  // the constructor
-  void SetUp() override
-  {
-    // getting the parameters
-    params = ::testing::TestWithParam<completenessParam>::GetParam();
-
-    nElements       = params.nElements;
-    lowerLabelRange = params.lowerLabelRange;
-    upperLabelRange = params.upperLabelRange;
-
-    // generating random value test input
-    std::vector<int> arr1(nElements, 0);
-    std::vector<int> arr2(nElements, 0);
-    std::random_device rd;
-    std::default_random_engine dre(rd());
-    std::uniform_int_distribution<int> intGenerator(lowerLabelRange, upperLabelRange);
-
-    std::generate(arr1.begin(), arr1.end(), [&]() { return intGenerator(dre); });
-    if (params.sameArrays) {
-      arr2 = arr1;
-    } else {
-      std::generate(arr2.begin(), arr2.end(), [&]() { return intGenerator(dre); });
-    }
-
-    // allocating and initializing memory to the GPU
-
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-    rmm::device_uvector<T> truthClusterArray(nElements, stream);
-    rmm::device_uvector<T> predClusterArray(nElements, stream);
-    raft::update_device(truthClusterArray.data(), arr1.data(), (int)nElements, stream);
-    raft::update_device(predClusterArray.data(), arr2.data(), (int)nElements, stream);
-
-    // calculating the golden output
-    double truthMI, truthEntropy;
-
-    truthMI      = MLCommon::Metrics::mutual_info_score(truthClusterArray.data(),
-                                                   predClusterArray.data(),
-                                                   nElements,
-                                                   lowerLabelRange,
-                                                   upperLabelRange,
-                                                   stream);
-    truthEntropy = MLCommon::Metrics::entropy(
-      predClusterArray.data(), nElements, lowerLabelRange, upperLabelRange, stream);
-
-    if (truthEntropy) {
-      truthCompleteness = truthMI / truthEntropy;
-    } else
-      truthCompleteness = 1.0;
-
-    if (nElements == 0) truthCompleteness = 1.0;
-
-    // calling the completeness CUDA implementation
-    computedCompleteness = MLCommon::Metrics::completeness_score(truthClusterArray.data(),
-                                                                 predClusterArray.data(),
-                                                                 nElements,
-                                                                 lowerLabelRange,
-                                                                 upperLabelRange,
-                                                                 stream);
-  }
-
-  // the destructor
-  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
-
-  // declaring the data values
-  completenessParam params;
-  T lowerLabelRange, upperLabelRange;
-  int nElements               = 0;
-  double truthCompleteness    = 0;
-  double computedCompleteness = 0;
-  cudaStream_t stream         = 0;
-};
-
-// setting test parameter values
-const std::vector<completenessParam> inputs = {{199, 1, 10, false, 0.000001},
-                                               {200, 15, 100, false, 0.000001},
-                                               {100, 1, 20, false, 0.000001},
-                                               {10, 1, 10, false, 0.000001},
-                                               {198, 1, 100, false, 0.000001},
-                                               {300, 3, 99, false, 0.000001},
-                                               {199, 1, 10, true, 0.000001},
-                                               {200, 15, 100, true, 0.000001},
-                                               {100, 1, 20, true, 0.000001},
-                                               {10, 1, 10, true, 0.000001},
-                                               {198, 1, 100, true, 0.000001},
-                                               {300, 3, 99, true, 0.000001}};
-
-// writing the test suite
-typedef completenessTest<int> completenessTestClass;
-TEST_P(completenessTestClass, Result)
-{
-  ASSERT_NEAR(computedCompleteness, truthCompleteness, params.tolerance);
-}
-INSTANTIATE_TEST_CASE_P(completeness, completenessTestClass, ::testing::ValuesIn(inputs));
-
-}  // end namespace Metrics
-}  // end namespace MLCommon
diff --git a/cpp/test/prims/contingencyMatrix.cu b/cpp/test/prims/contingencyMatrix.cu
deleted file mode 100644
index bbdb309682..0000000000
--- a/cpp/test/prims/contingencyMatrix.cu
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "test_utils.h"
-#include <algorithm>
-#include <gtest/gtest.h>
-#include <iostream>
-#include <metrics/contingencyMatrix.cuh>
-#include <raft/cudart_utils.h>
-#include <random>
-#include <rmm/device_uvector.hpp>
-
-namespace MLCommon {
-namespace Metrics {
-
-struct ContingencyMatrixParam {
-  int nElements;
-  int minClass;
-  int maxClass;
-  bool calcCardinality;
-  bool skipLabels;
-  float tolerance;
-};
-
-template <typename T>
-class ContingencyMatrixTest : public ::testing::TestWithParam<ContingencyMatrixParam> {
- protected:
-  ContingencyMatrixTest()
-    : pWorkspace(0, stream),
-      dY(0, stream),
-      dYHat(0, stream),
-      dComputedOutput(0, stream),
-      dGoldenOutput(0, stream)
-  {
-  }
-
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<ContingencyMatrixParam>::GetParam();
-
-    int numElements     = params.nElements;
-    int lowerLabelRange = params.minClass;
-    int upperLabelRange = params.maxClass;
-
-    std::vector<int> y(numElements, 0);
-    std::vector<int> y_hat(numElements, 0);
-    std::random_device rd;
-    std::default_random_engine dre(rd());
-    std::uniform_int_distribution<int> intGenerator(lowerLabelRange, upperLabelRange);
-
-    std::generate(y.begin(), y.end(), [&]() { return intGenerator(dre); });
-    std::generate(y_hat.begin(), y_hat.end(), [&]() { return intGenerator(dre); });
-
-    if (params.skipLabels) {
-      // remove two label value from input arrays
-      int y1 = (upperLabelRange - lowerLabelRange) / 2;
-      int y2 = y1 + (upperLabelRange - lowerLabelRange) / 4;
-
-      // replacement values
-      int y1_R = y1 + 1;
-      int y2_R = y2 + 1;
-
-      std::replace(y.begin(), y.end(), y1, y1_R);
-      std::replace(y.begin(), y.end(), y2, y2_R);
-      std::replace(y_hat.begin(), y_hat.end(), y1, y1_R);
-      std::replace(y_hat.begin(), y_hat.end(), y2, y2_R);
-    }
-
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-    dY.resize(numElements, stream);
-    dYHat.resize(numElements, stream);
-
-    raft::update_device(dYHat.data(), &y_hat[0], numElements, stream);
-    raft::update_device(dY.data(), &y[0], numElements, stream);
-
-    if (params.calcCardinality) {
-      MLCommon::Metrics::getInputClassCardinality(
-        dY.data(), numElements, stream, minLabel, maxLabel);
-    } else {
-      minLabel = lowerLabelRange;
-      maxLabel = upperLabelRange;
-    }
-
-    numUniqueClasses = maxLabel - minLabel + 1;
-
-    dComputedOutput.resize(numUniqueClasses * numUniqueClasses, stream);
-    dGoldenOutput.resize(numUniqueClasses * numUniqueClasses, stream);
-
-    // generate golden output on CPU
-    size_t sizeOfMat = numUniqueClasses * numUniqueClasses * sizeof(int);
-    std::vector<int> hGoldenOutput(sizeOfMat, 0);
-
-    for (int i = 0; i < numElements; i++) {
-      auto row    = y[i] - minLabel;
-      auto column = y_hat[i] - minLabel;
-      hGoldenOutput[row * numUniqueClasses + column] += 1;
-    }
-
-    raft::update_device(
-      dGoldenOutput.data(), hGoldenOutput.data(), numUniqueClasses * numUniqueClasses, stream);
-
-    workspaceSz = MLCommon::Metrics::getContingencyMatrixWorkspaceSize(
-      numElements, dY.data(), stream, minLabel, maxLabel);
-    pWorkspace.resize(workspaceSz, stream);
-    raft::interruptible::synchronize(stream);
-  }
-
-  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
-
-  void RunTest()
-  {
-    int numElements = params.nElements;
-    MLCommon::Metrics::contingencyMatrix(dY.data(),
-                                         dYHat.data(),
-                                         numElements,
-                                         dComputedOutput.data(),
-                                         stream,
-                                         (void*)pWorkspace.data(),
-                                         workspaceSz,
-                                         minLabel,
-                                         maxLabel);
-    ASSERT_TRUE(raft::devArrMatch(dComputedOutput.data(),
-                                  dGoldenOutput.data(),
-                                  numUniqueClasses * numUniqueClasses,
-                                  raft::Compare<T>()));
-  }
-
-  ContingencyMatrixParam params;
-  int numUniqueClasses = -1;
-  T minLabel, maxLabel;
-  cudaStream_t stream = 0;
-  size_t workspaceSz;
-  rmm::device_uvector<char> pWorkspace;
-  rmm::device_uvector<T> dY, dYHat;
-  rmm::device_uvector<int> dComputedOutput, dGoldenOutput;
-};
-
-const std::vector<ContingencyMatrixParam> inputs = {
-  {10000, 1, 10, true, false, 0.000001},
-  {10000, 1, 5000, true, false, 0.000001},
-  {10000, 1, 10000, true, false, 0.000001},
-  {10000, 1, 20000, true, false, 0.000001},
-  {10000, 1, 10, false, false, 0.000001},
-  {10000, 1, 5000, false, false, 0.000001},
-  {10000, 1, 10000, false, false, 0.000001},
-  {10000, 1, 20000, false, false, 0.000001},
-  {100000, 1, 100, false, false, 0.000001},
-  {1000000, 1, 1200, true, false, 0.000001},
-  {1000000, 1, 10000, false, false, 0.000001},
-  {100000, 1, 100, false, true, 0.000001},
-};
-
-typedef ContingencyMatrixTest<int> ContingencyMatrixTestS;
-TEST_P(ContingencyMatrixTestS, Result) { RunTest(); }
-INSTANTIATE_TEST_CASE_P(ContingencyMatrix, ContingencyMatrixTestS, ::testing::ValuesIn(inputs));
-}  // namespace Metrics
-}  // namespace MLCommon
diff --git a/cpp/test/prims/dispersion.cu b/cpp/test/prims/dispersion.cu
deleted file mode 100644
index 78c0746ee4..0000000000
--- a/cpp/test/prims/dispersion.cu
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "test_utils.h"
-#include <gtest/gtest.h>
-#include <metrics/dispersion.cuh>
-#include <raft/cuda_utils.cuh>
-#include <raft/interruptible.hpp>
-#include <raft/random/rng.hpp>
-#include <rmm/device_uvector.hpp>
-#include <stdio.h>
-#include <stdlib.h>
-#include <vector>
-
-namespace MLCommon {
-namespace Metrics {
-
-template <typename T>
-struct DispersionInputs {
-  T tolerance;
-  int dim, clusters;
-  unsigned long long int seed;
-};
-
-template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const DispersionInputs<T>& dims)
-{
-  return os;
-}
-
-template <typename T>
-class DispersionTest : public ::testing::TestWithParam<DispersionInputs<T>> {
- protected:
-  DispersionTest() : exp_mean(0, stream), act_mean(0, stream) {}
-
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<DispersionInputs<T>>::GetParam();
-    raft::random::Rng r(params.seed);
-    int len = params.clusters * params.dim;
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-    rmm::device_uvector<T> data(len, stream);
-    rmm::device_uvector<int> counts(params.clusters, stream);
-    exp_mean.resize(params.dim, stream);
-    act_mean.resize(params.dim, stream);
-    r.uniform(data.data(), len, (T)-1.0, (T)1.0, stream);
-    r.uniformInt(counts.data(), params.clusters, 1, 100, stream);
-    std::vector<int> h_counts(params.clusters, 0);
-    raft::update_host(&(h_counts[0]), counts.data(), params.clusters, stream);
-    npoints = 0;
-    for (const auto& val : h_counts) {
-      npoints += val;
-    }
-    actualVal = dispersion(
-      data.data(), counts.data(), act_mean.data(), params.clusters, npoints, params.dim, stream);
-    expectedVal = T(0);
-    std::vector<T> h_data(len, T(0));
-    raft::update_host(&(h_data[0]), data.data(), len, stream);
-    std::vector<T> mean(params.dim, T(0));
-    for (int i = 0; i < params.clusters; ++i) {
-      for (int j = 0; j < params.dim; ++j) {
-        mean[j] += h_data[i * params.dim + j] * T(h_counts[i]);
-      }
-    }
-    for (int i = 0; i < params.dim; ++i) {
-      mean[i] /= T(npoints);
-    }
-    raft::update_device(exp_mean.data(), &(mean[0]), params.dim, stream);
-    for (int i = 0; i < params.clusters; ++i) {
-      for (int j = 0; j < params.dim; ++j) {
-        auto diff = h_data[i * params.dim + j] - mean[j];
-        expectedVal += diff * diff * T(h_counts[i]);
-      }
-    }
-    expectedVal = sqrt(expectedVal);
-    raft::interruptible::synchronize(stream);
-  }
-
-  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
-
- protected:
-  DispersionInputs<T> params;
-  rmm::device_uvector<T> exp_mean, act_mean;
-  cudaStream_t stream = 0;
-  int npoints;
-  T expectedVal, actualVal;
-};
-
-const std::vector<DispersionInputs<float>> inputsf = {
-  {0.001f, 10, 1000, 1234ULL}, {0.001f, 100, 100, 1234ULL}, {0.001f, 1000, 1000, 1234ULL}};
-typedef DispersionTest<float> DispersionTestF;
-TEST_P(DispersionTestF, Result)
-{
-  auto eq = raft::CompareApprox<float>(params.tolerance);
-  ASSERT_TRUE(devArrMatch(exp_mean.data(), act_mean.data(), params.dim, eq));
-  ASSERT_TRUE(match(expectedVal, actualVal, eq));
-}
-INSTANTIATE_TEST_CASE_P(DispersionTests, DispersionTestF, ::testing::ValuesIn(inputsf));
-
-const std::vector<DispersionInputs<double>> inputsd = {
-  {0.001, 10, 1000, 1234ULL}, {0.001, 100, 100, 1234ULL}, {0.001, 1000, 1000, 1234ULL}};
-typedef DispersionTest<double> DispersionTestD;
-TEST_P(DispersionTestD, Result)
-{
-  auto eq = raft::CompareApprox<double>(params.tolerance);
-  ASSERT_TRUE(devArrMatch(exp_mean.data(), act_mean.data(), params.dim, eq));
-  ASSERT_TRUE(match(expectedVal, actualVal, eq));
-}
-INSTANTIATE_TEST_CASE_P(DispersionTests, DispersionTestD, ::testing::ValuesIn(inputsd));
-
-}  // end namespace Metrics
-}  // end namespace MLCommon
diff --git a/cpp/test/prims/entropy.cu b/cpp/test/prims/entropy.cu
deleted file mode 100644
index 8afcfa60ef..0000000000
--- a/cpp/test/prims/entropy.cu
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "test_utils.h"
-#include <algorithm>
-#include <gtest/gtest.h>
-#include <iostream>
-#include <metrics/entropy.cuh>
-#include <raft/cudart_utils.h>
-#include <raft/interruptible.hpp>
-#include <random>
-#include <rmm/device_uvector.hpp>
-
-namespace MLCommon {
-namespace Metrics {
-
-struct entropyParam {
-  int nElements;
-  int lowerLabelRange;
-  int upperLabelRange;
-  double tolerance;
-};
-
-// test fixture class
-template <typename T>
-class entropyTest : public ::testing::TestWithParam<entropyParam> {
- protected:
-  // the constructor
-  void SetUp() override
-  {
-    // getting the parameters
-    params = ::testing::TestWithParam<entropyParam>::GetParam();
-
-    nElements       = params.nElements;
-    lowerLabelRange = params.lowerLabelRange;
-    upperLabelRange = params.upperLabelRange;
-
-    // generating random value test input
-    std::vector<int> arr1(nElements, 0);
-    std::random_device rd;
-    std::default_random_engine dre(rd());
-    std::uniform_int_distribution<int> intGenerator(lowerLabelRange, upperLabelRange);
-
-    std::generate(arr1.begin(), arr1.end(), [&]() { return intGenerator(dre); });
-
-    // generating the golden output
-    int numUniqueClasses = upperLabelRange - lowerLabelRange + 1;
-
-    int* p = (int*)malloc(numUniqueClasses * sizeof(int));
-    memset(p, 0, numUniqueClasses * sizeof(int));
-
-    // calculating the bincount array
-    for (int i = 0; i < nElements; ++i) {
-      ++p[arr1[i] - lowerLabelRange];
-    }
-
-    // calculating the aggregate entropy
-    for (int i = 0; i < numUniqueClasses; ++i) {
-      if (p[i])
-        truthEntropy +=
-          -1 * (double(p[i]) / double(nElements)) * (log(double(p[i])) - log(double(nElements)));
-    }
-
-    // allocating and initializing memory to the GPU
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-    rmm::device_uvector<T> clusterArray(nElements, stream);
-    raft::update_device(clusterArray.data(), &arr1[0], (int)nElements, stream);
-
-    raft::interruptible::synchronize(stream);
-    // calling the entropy CUDA implementation
-    computedEntropy = MLCommon::Metrics::entropy(
-      clusterArray.data(), nElements, lowerLabelRange, upperLabelRange, stream);
-    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-  }
-
-  // declaring the data values
-  entropyParam params;
-  T lowerLabelRange, upperLabelRange;
-
-  int nElements          = 0;
-  double truthEntropy    = 0;
-  double computedEntropy = 0;
-  cudaStream_t stream    = 0;
-};
-
-// setting test parameter values
-const std::vector<entropyParam> inputs = {{199, 1, 10, 0.000001},
-                                          {200, 15, 100, 0.000001},
-                                          {100, 1, 20, 0.000001},
-                                          {10, 1, 10, 0.000001},
-                                          {198, 1, 100, 0.000001},
-                                          {300, 3, 99, 0.000001},
-                                          {199, 1, 10, 0.000001},
-                                          {200, 15, 100, 0.000001},
-                                          {100, 1, 20, 0.000001},
-                                          {10, 1, 10, 0.000001},
-                                          {198, 1, 100, 0.000001},
-                                          {300, 3, 99, 0.000001}};
-
-// writing the test suite
-typedef entropyTest<int> entropyTestClass;
-TEST_P(entropyTestClass, Result) { ASSERT_NEAR(computedEntropy, truthEntropy, params.tolerance); }
-INSTANTIATE_TEST_CASE_P(entropy, entropyTestClass, ::testing::ValuesIn(inputs));
-
-}  // end namespace Metrics
-}  // end namespace MLCommon
diff --git a/cpp/test/prims/homogeneity_score.cu b/cpp/test/prims/homogeneity_score.cu
deleted file mode 100644
index e9c7cd5fbb..0000000000
--- a/cpp/test/prims/homogeneity_score.cu
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "test_utils.h"
-#include <algorithm>
-#include <gtest/gtest.h>
-#include <iostream>
-#include <metrics/homogeneity_score.cuh>
-#include <raft/cudart_utils.h>
-#include <random>
-
-namespace MLCommon {
-namespace Metrics {
-
-// parameter structure definition
-struct homogeneityParam {
-  int nElements;
-  int lowerLabelRange;
-  int upperLabelRange;
-  bool sameArrays;
-  double tolerance;
-};
-
-// test fixture class
-template <typename T>
-class homogeneityTest : public ::testing::TestWithParam<homogeneityParam> {
- protected:
-  // the constructor
-  void SetUp() override
-  {
-    // getting the parameters
-    params = ::testing::TestWithParam<homogeneityParam>::GetParam();
-
-    nElements       = params.nElements;
-    lowerLabelRange = params.lowerLabelRange;
-    upperLabelRange = params.upperLabelRange;
-
-    // generating random value test input
-    std::vector<int> arr1(nElements, 0);
-    std::vector<int> arr2(nElements, 0);
-    std::random_device rd;
-    std::default_random_engine dre(rd());
-    std::uniform_int_distribution<int> intGenerator(lowerLabelRange, upperLabelRange);
-
-    std::generate(arr1.begin(), arr1.end(), [&]() { return intGenerator(dre); });
-    if (params.sameArrays) {
-      arr2 = arr1;
-    } else {
-      std::generate(arr2.begin(), arr2.end(), [&]() { return intGenerator(dre); });
-    }
-
-    // allocating and initializing memory to the GPU
-
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-    rmm::device_uvector<T> truthClusterArray(nElements, stream);
-    rmm::device_uvector<T> predClusterArray(nElements, stream);
-    raft::update_device(truthClusterArray.data(), &arr1[0], (int)nElements, stream);
-    raft::update_device(predClusterArray.data(), &arr2[0], (int)nElements, stream);
-
-    // calculating the golden output
-    double truthMI, truthEntropy;
-
-    truthMI      = MLCommon::Metrics::mutual_info_score(truthClusterArray.data(),
-                                                   predClusterArray.data(),
-                                                   nElements,
-                                                   lowerLabelRange,
-                                                   upperLabelRange,
-                                                   stream);
-    truthEntropy = MLCommon::Metrics::entropy(
-      truthClusterArray.data(), nElements, lowerLabelRange, upperLabelRange, stream);
-
-    if (truthEntropy) {
-      truthHomogeneity = truthMI / truthEntropy;
-    } else
-      truthHomogeneity = 1.0;
-
-    if (nElements == 0) truthHomogeneity = 1.0;
-
-    // calling the homogeneity CUDA implementation
-    computedHomogeneity = MLCommon::Metrics::homogeneity_score(truthClusterArray.data(),
-                                                               predClusterArray.data(),
-                                                               nElements,
-                                                               lowerLabelRange,
-                                                               upperLabelRange,
-                                                               stream);
-    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-  }
-
-  // declaring the data values
-  homogeneityParam params;
-  T lowerLabelRange, upperLabelRange;
-  int nElements              = 0;
-  double truthHomogeneity    = 0;
-  double computedHomogeneity = 0;
-  cudaStream_t stream        = 0;
-};
-
-// setting test parameter values
-const std::vector<homogeneityParam> inputs = {{199, 1, 10, false, 0.000001},
-                                              {200, 15, 100, false, 0.000001},
-                                              {100, 1, 20, false, 0.000001},
-                                              {10, 1, 10, false, 0.000001},
-                                              {198, 1, 100, false, 0.000001},
-                                              {300, 3, 99, false, 0.000001},
-                                              {199, 1, 10, true, 0.000001},
-                                              {200, 15, 100, true, 0.000001},
-                                              {100, 1, 20, true, 0.000001},
-                                              {10, 1, 10, true, 0.000001},
-                                              {198, 1, 100, true, 0.000001},
-                                              {300, 3, 99, true, 0.000001}};
-
-// writing the test suite
-typedef homogeneityTest<int> homogeneityTestClass;
-TEST_P(homogeneityTestClass, Result)
-{
-  ASSERT_NEAR(computedHomogeneity, truthHomogeneity, params.tolerance);
-}
-INSTANTIATE_TEST_CASE_P(homogeneity, homogeneityTestClass, ::testing::ValuesIn(inputs));
-
-}  // end namespace Metrics
-}  // end namespace MLCommon
diff --git a/cpp/test/prims/kl_divergence.cu b/cpp/test/prims/kl_divergence.cu
deleted file mode 100644
index 9e7991e4e0..0000000000
--- a/cpp/test/prims/kl_divergence.cu
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "test_utils.h"
-#include <algorithm>
-#include <gtest/gtest.h>
-#include <iostream>
-#include <metrics/kl_divergence.cuh>
-#include <raft/cudart_utils.h>
-#include <random>
-
-namespace MLCommon {
-namespace Metrics {
-
-// parameter structure definition
-struct klDivergenceParam {
-  int nElements;
-  double tolerance;
-};
-
-// test fixture class
-template <typename DataT>
-class klDivergenceTest : public ::testing::TestWithParam<klDivergenceParam> {
- protected:
-  // the constructor
-  void SetUp() override
-  {
-    // getting the parameters
-    params = ::testing::TestWithParam<klDivergenceParam>::GetParam();
-
-    nElements = params.nElements;
-
-    // generating random value test input
-    std::vector<DataT> h_modelPDF(nElements, 0);
-    std::vector<DataT> h_candidatePDF(nElements, 0);
-    std::random_device rd;
-    std::default_random_engine dre(rd());
-    std::uniform_real_distribution<DataT> realGenerator(0.0, 1.0);
-
-    std::generate(h_modelPDF.begin(), h_modelPDF.end(), [&]() { return realGenerator(dre); });
-    std::generate(
-      h_candidatePDF.begin(), h_candidatePDF.end(), [&]() { return realGenerator(dre); });
-
-    // allocating and initializing memory to the GPU
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-    rmm::device_uvector<DataT> d_modelPDF(nElements, stream);
-    rmm::device_uvector<DataT> d_candidatePDF(nElements, stream);
-    RAFT_CUDA_TRY(cudaMemset(d_modelPDF.data(), 0, d_modelPDF.size() * sizeof(DataT)));
-    RAFT_CUDA_TRY(cudaMemset(d_candidatePDF.data(), 0, d_candidatePDF.size() * sizeof(DataT)));
-
-    raft::update_device(d_modelPDF.data(), &h_modelPDF[0], (int)nElements, stream);
-    raft::update_device(d_candidatePDF.data(), &h_candidatePDF[0], (int)nElements, stream);
-
-    // generating the golden output
-    for (int i = 0; i < nElements; ++i) {
-      if (h_modelPDF[i] == 0.0)
-        truthklDivergence += 0;
-
-      else
-        truthklDivergence += h_modelPDF[i] * log(h_modelPDF[i] / h_candidatePDF[i]);
-    }
-
-    // calling the kl_divergence CUDA implementation
-    computedklDivergence =
-      MLCommon::Metrics::kl_divergence(d_modelPDF.data(), d_candidatePDF.data(), nElements, stream);
-    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-  }
-
-  // declaring the data values
-  klDivergenceParam params;
-  int nElements              = 0;
-  DataT truthklDivergence    = 0;
-  DataT computedklDivergence = 0;
-  cudaStream_t stream        = 0;
-};
-
-// setting test parameter values
-const std::vector<klDivergenceParam> inputs = {
-  {500, 0.000001}, {200, 0.001}, {5000, 0.000001}, {500000, 0.000001}
-
-};
-
-// writing the test suite
-typedef klDivergenceTest<double> klDivergenceTestClass;
-TEST_P(klDivergenceTestClass, Result)
-{
-  ASSERT_NEAR(computedklDivergence, truthklDivergence, params.tolerance);
-}
-INSTANTIATE_TEST_CASE_P(klDivergence, klDivergenceTestClass, ::testing::ValuesIn(inputs));
-
-}  // end namespace Metrics
-}  // end namespace MLCommon
diff --git a/cpp/test/prims/mutual_info_score.cu b/cpp/test/prims/mutual_info_score.cu
deleted file mode 100644
index 1087256fe3..0000000000
--- a/cpp/test/prims/mutual_info_score.cu
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "test_utils.h"
-#include <algorithm>
-#include <gtest/gtest.h>
-#include <iostream>
-#include <metrics/contingencyMatrix.cuh>
-#include <metrics/mutual_info_score.cuh>
-#include <raft/cudart_utils.h>
-#include <random>
-
-namespace MLCommon {
-namespace Metrics {
-
-// parameter structure definition
-struct mutualInfoParam {
-  int nElements;
-  int lowerLabelRange;
-  int upperLabelRange;
-  bool sameArrays;
-  double tolerance;
-};
-
-// test fixture class
-template <typename T>
-class mutualInfoTest : public ::testing::TestWithParam<mutualInfoParam> {
- protected:
-  // the constructor
-  void SetUp() override
-  {
-    // getting the parameters
-    params = ::testing::TestWithParam<mutualInfoParam>::GetParam();
-
-    nElements       = params.nElements;
-    lowerLabelRange = params.lowerLabelRange;
-    upperLabelRange = params.upperLabelRange;
-
-    // generating random value test input
-    std::vector<int> arr1(nElements, 0);
-    std::vector<int> arr2(nElements, 0);
-    std::random_device rd;
-    std::default_random_engine dre(rd());
-    std::uniform_int_distribution<int> intGenerator(lowerLabelRange, upperLabelRange);
-
-    std::generate(arr1.begin(), arr1.end(), [&]() { return intGenerator(dre); });
-    if (params.sameArrays) {
-      arr2 = arr1;
-    } else {
-      std::generate(arr2.begin(), arr2.end(), [&]() { return intGenerator(dre); });
-    }
-
-    // generating the golden output
-    // calculating the contingency matrix
-    int numUniqueClasses = upperLabelRange - lowerLabelRange + 1;
-    size_t sizeOfMat     = numUniqueClasses * numUniqueClasses * sizeof(int);
-    int* hGoldenOutput   = (int*)malloc(sizeOfMat);
-    memset(hGoldenOutput, 0, sizeOfMat);
-    int i, j;
-    for (i = 0; i < nElements; i++) {
-      int row    = arr1[i] - lowerLabelRange;
-      int column = arr2[i] - lowerLabelRange;
-
-      hGoldenOutput[row * numUniqueClasses + column] += 1;
-    }
-
-    int* a = (int*)malloc(numUniqueClasses * sizeof(int));
-    int* b = (int*)malloc(numUniqueClasses * sizeof(int));
-    memset(a, 0, numUniqueClasses * sizeof(int));
-    memset(b, 0, numUniqueClasses * sizeof(int));
-
-    // and also the reducing contingency matrix along row and column
-    for (i = 0; i < numUniqueClasses; ++i) {
-      for (j = 0; j < numUniqueClasses; ++j) {
-        a[i] += hGoldenOutput[i * numUniqueClasses + j];
-        b[i] += hGoldenOutput[j * numUniqueClasses + i];
-      }
-    }
-
-    // calculating the truth mutual information
-    for (int i = 0; i < numUniqueClasses; ++i) {
-      for (int j = 0; j < numUniqueClasses; ++j) {
-        if (a[i] * b[j] != 0 && hGoldenOutput[i * numUniqueClasses + j] != 0) {
-          truthmutualInfo +=
-            (double)(hGoldenOutput[i * numUniqueClasses + j]) *
-            (log((double)(double(nElements) * hGoldenOutput[i * numUniqueClasses + j])) -
-             log((double)(a[i] * b[j])));
-        }
-      }
-    }
-
-    truthmutualInfo /= nElements;
-
-    // allocating and initializing memory to the GPU
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-    rmm::device_uvector<T> firstClusterArray(nElements, stream);
-    rmm::device_uvector<T> secondClusterArray(nElements, stream);
-    RAFT_CUDA_TRY(
-      cudaMemsetAsync(firstClusterArray.data(), 0, firstClusterArray.size() * sizeof(T), stream));
-    RAFT_CUDA_TRY(
-      cudaMemsetAsync(secondClusterArray.data(), 0, secondClusterArray.size() * sizeof(T), stream));
-
-    raft::update_device(firstClusterArray.data(), &arr1[0], (int)nElements, stream);
-    raft::update_device(secondClusterArray.data(), &arr2[0], (int)nElements, stream);
-
-    // calling the mutualInfo CUDA implementation
-    computedmutualInfo = MLCommon::Metrics::mutual_info_score(firstClusterArray.data(),
-                                                              secondClusterArray.data(),
-                                                              nElements,
-                                                              lowerLabelRange,
-                                                              upperLabelRange,
-                                                              stream);
-  }
-
-  // the destructor
-  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
-
-  // declaring the data values
-  mutualInfoParam params;
-  T lowerLabelRange, upperLabelRange;
-  int nElements             = 0;
-  double truthmutualInfo    = 0;
-  double computedmutualInfo = 0;
-  cudaStream_t stream       = 0;
-};
-
-// setting test parameter values
-const std::vector<mutualInfoParam> inputs = {{199, 1, 10, false, 0.000001},
-                                             {200, 15, 100, false, 0.000001},
-                                             {100, 1, 20, false, 0.000001},
-                                             {10, 1, 10, false, 0.000001},
-                                             {198, 1, 100, false, 0.000001},
-                                             {300, 3, 99, false, 0.000001},
-                                             {199, 1, 10, true, 0.000001},
-                                             {200, 15, 100, true, 0.000001},
-                                             {100, 1, 20, true, 0.000001},
-                                             {10, 1, 10, true, 0.000001},
-                                             {198, 1, 100, true, 0.000001},
-                                             {300, 3, 99, true, 0.000001}};
-
-// writing the test suite
-typedef mutualInfoTest<int> mutualInfoTestClass;
-TEST_P(mutualInfoTestClass, Result)
-{
-  ASSERT_NEAR(computedmutualInfo, truthmutualInfo, params.tolerance);
-}
-INSTANTIATE_TEST_CASE_P(mutualInfo, mutualInfoTestClass, ::testing::ValuesIn(inputs));
-
-}  // end namespace Metrics
-}  // end namespace MLCommon
diff --git a/cpp/test/prims/rand_index.cu b/cpp/test/prims/rand_index.cu
deleted file mode 100644
index 9deab5f17a..0000000000
--- a/cpp/test/prims/rand_index.cu
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "test_utils.h"
-
-#include <metrics/rand_index.cuh>
-
-#include <raft/cudart_utils.h>
-
-#include <gtest/gtest.h>
-
-#include <algorithm>
-#include <iostream>
-#include <metrics/rand_index.cuh>
-#include <random>
-
-namespace MLCommon {
-namespace Metrics {
-
-// parameter structure definition
-struct randIndexParam {
-  uint64_t nElements;
-  int lowerLabelRange;
-  int upperLabelRange;
-  double tolerance;
-};
-
-// test fixture class
-template <typename T>
-class randIndexTest : public ::testing::TestWithParam<randIndexParam> {
- protected:
-  // the constructor
-  void SetUp() override
-  {
-    // getting the parameters
-    params = ::testing::TestWithParam<randIndexParam>::GetParam();
-
-    size            = params.nElements;
-    lowerLabelRange = params.lowerLabelRange;
-    upperLabelRange = params.upperLabelRange;
-
-    // generating random value test input
-    std::vector<int> arr1(size, 0);
-    std::vector<int> arr2(size, 0);
-    std::random_device rd;
-    std::default_random_engine dre(rd());
-    std::uniform_int_distribution<int> intGenerator(lowerLabelRange, upperLabelRange);
-
-    std::generate(arr1.begin(), arr1.end(), [&]() { return intGenerator(dre); });
-    std::generate(arr2.begin(), arr2.end(), [&]() { return intGenerator(dre); });
-
-    // generating the golden output
-    int64_t a_truth = 0;
-    int64_t b_truth = 0;
-
-    for (uint64_t iter = 0; iter < size; ++iter) {
-      for (uint64_t jiter = 0; jiter < iter; ++jiter) {
-        if (arr1[iter] == arr1[jiter] && arr2[iter] == arr2[jiter]) {
-          ++a_truth;
-        } else if (arr1[iter] != arr1[jiter] && arr2[iter] != arr2[jiter]) {
-          ++b_truth;
-        }
-      }
-    }
-    uint64_t nChooseTwo = (size * (size - 1)) / 2;
-    truthRandIndex      = (double)(((double)(a_truth + b_truth)) / (double)nChooseTwo);
-
-    // allocating and initializing memory to the GPU
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-    rmm::device_uvector<T> firstClusterArray(size, stream);
-    rmm::device_uvector<T> secondClusterArray(size, stream);
-    RAFT_CUDA_TRY(
-      cudaMemsetAsync(firstClusterArray.data(), 0, firstClusterArray.size() * sizeof(T), stream));
-    RAFT_CUDA_TRY(
-      cudaMemsetAsync(secondClusterArray.data(), 0, secondClusterArray.size() * sizeof(T), stream));
-
-    raft::update_device(firstClusterArray.data(), &arr1[0], (int)size, stream);
-    raft::update_device(secondClusterArray.data(), &arr2[0], (int)size, stream);
-
-    // calling the rand_index CUDA implementation
-    computedRandIndex = MLCommon::Metrics::compute_rand_index(
-      firstClusterArray.data(), secondClusterArray.data(), size, stream);
-  }
-
-  // the destructor
-  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
-
-  // declaring the data values
-  randIndexParam params;
-  int lowerLabelRange = 0, upperLabelRange = 2;
-  uint64_t size            = 0;
-  double truthRandIndex    = 0;
-  double computedRandIndex = 0;
-  cudaStream_t stream      = 0;
-};
-
-// setting test parameter values
-const std::vector<randIndexParam> inputs = {{199, 1, 10, 0.000001},
-                                            {200, 1, 100, 0.000001},
-                                            {10, 1, 1200, 0.000001},
-                                            {100, 1, 10000, 0.000001},
-                                            {198, 1, 100, 0.000001},
-                                            {300, 3, 99, 0.000001},
-                                            {2, 0, 0, 0.00001}};
-
-// writing the test suite
-typedef randIndexTest<int> randIndexTestClass;
-TEST_P(randIndexTestClass, Result)
-{
-  ASSERT_NEAR(computedRandIndex, truthRandIndex, params.tolerance);
-}
-INSTANTIATE_TEST_CASE_P(randIndex, randIndexTestClass, ::testing::ValuesIn(inputs));
-
-}  // end namespace Metrics
-}  // end namespace MLCommon
diff --git a/cpp/test/prims/score.cu b/cpp/test/prims/score.cu
deleted file mode 100644
index e0a0102c39..0000000000
--- a/cpp/test/prims/score.cu
+++ /dev/null
@@ -1,481 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "test_utils.h"
-#include <gtest/gtest.h>
-#include <iostream>
-#include <metrics/scores.cuh>
-#include <raft/cudart_utils.h>
-#include <raft/interruptible.hpp>
-#include <raft/random/rng.hpp>
-#include <rmm/device_uvector.hpp>
-#include <vector>
-
-namespace MLCommon {
-namespace Score {
-
-class ScoreTest : public ::testing::Test {
- protected:
-  void SetUp() override {}
-
-  void TearDown() override {}
-};
-
-typedef ScoreTest ScoreTestHighScore;
-TEST(ScoreTestHighScore, Result)
-{
-  float y[5]     = {0.1, 0.2, 0.3, 0.4, 0.5};
-  float y_hat[5] = {0.12, 0.22, 0.32, 0.42, 0.52};
-
-  cudaStream_t stream = 0;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-  rmm::device_uvector<float> d_y(5, stream);
-  rmm::device_uvector<float> d_y_hat(5, stream);
-
-  raft::update_device(d_y_hat.data(), y_hat, 5, stream);
-  raft::update_device(d_y.data(), y, 5, stream);
-
-  auto result = MLCommon::Score::r2_score(d_y.data(), d_y_hat.data(), 5, stream);
-  ASSERT_TRUE(result == 0.98f);
-  RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-}
-
-typedef ScoreTest ScoreTestLowScore;
-TEST(ScoreTestLowScore, Result)
-{
-  float y[5]     = {0.1, 0.2, 0.3, 0.4, 0.5};
-  float y_hat[5] = {0.012, 0.022, 0.032, 0.042, 0.052};
-
-  cudaStream_t stream = 0;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-  rmm::device_uvector<float> d_y(5, stream);
-  rmm::device_uvector<float> d_y_hat(5, stream);
-
-  raft::update_device(d_y_hat.data(), y_hat, 5, stream);
-  raft::update_device(d_y.data(), y, 5, stream);
-
-  auto result = MLCommon::Score::r2_score(d_y.data(), d_y_hat.data(), 5, stream);
-
-  std::cout << "Result: " << result - -3.4012f << std::endl;
-  ASSERT_TRUE(result - -3.4012f < 0.00001);
-  RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-}
-
-// Tests for accuracy_score
-
-struct AccuracyInputs {
-  /**
-   * Number of predictions.
-   */
-  int n;
-  /**
-   * Number of predictions w/ different values than their corresponding element in reference
-   * predictions. Valid range [0, n]. changed_n in [0, n] will yield accuracy of (n - changed_n) /
-   * n.
-   */
-  int changed_n;
-  /**
-   * Seed for randomly generated predictions.
-   */
-  unsigned long long int seed;
-};
-
-std::ostream& operator<<(::std::ostream& os, const AccuracyInputs& acc_inputs)
-{
-  os << "AccuracyInputs are {" << acc_inputs.n << ", " << acc_inputs.changed_n << ", "
-     << acc_inputs.seed << "}" << std::endl;
-  return os;
-}
-
-template <typename T>
-__global__ void change_vals(T* predictions, T* ref_predictions, const int changed_n)
-{
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  if (tid < changed_n) {
-    predictions[tid] = ref_predictions[tid] + 1;  // change first changed_n predictions
-  }
-}
-
-template <typename T>
-class AccuracyTest : public ::testing::TestWithParam<AccuracyInputs> {
- protected:
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<AccuracyInputs>::GetParam();
-    ASSERT((params.changed_n <= params.n) && (params.changed_n >= 0), "Invalid params.");
-
-    raft::random::Rng r(params.seed);
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-    rmm::device_uvector<T> predictions(params.n, stream);
-    rmm::device_uvector<T> ref_predictions(params.n, stream);
-    r.normal(ref_predictions.data(), params.n, (T)0.0, (T)1.0, stream);
-    raft::copy_async(predictions.data(), ref_predictions.data(), params.n, stream);
-    raft::interruptible::synchronize(stream);
-
-    // Modify params.changed_n unique predictions to a different value. New value is irrelevant.
-    if (params.changed_n > 0) {
-      int threads = 64;
-      int blocks  = raft::ceildiv(params.changed_n, threads);
-      //@todo Could also generate params.changed_n unique random positions in [0, n) range, instead
-      // of changing the first ones.
-      change_vals<T><<<blocks, threads, 0, stream>>>(
-        predictions.data(), ref_predictions.data(), params.changed_n);
-      RAFT_CUDA_TRY(cudaGetLastError());
-      raft::interruptible::synchronize(stream);
-    }
-
-    computed_accuracy = MLCommon::Score::accuracy_score<T>(
-      predictions.data(), ref_predictions.data(), params.n, stream);
-    ref_accuracy = (params.n - params.changed_n) * 1.0f / params.n;
-    // std::cout << "computed_accuracy is " << computed_accuracy << " ref_accuracy is " <<
-    // ref_accuracy << std::endl;
-  }
-
-  void TearDown() override
-  {
-    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-    computed_accuracy = -1.0f;
-    ref_accuracy      = -1.0f;
-  }
-
-  AccuracyInputs params;
-  float computed_accuracy, ref_accuracy;
-  cudaStream_t stream = 0;
-};
-
-const std::vector<AccuracyInputs> inputs = {
-  {1, 1, 1234ULL},        // single element, wrong prediction
-  {1, 0, 1234ULL},        // single element, perfect prediction
-  {2, 1, 1234ULL},        // multiple elements, 0.5 accuracy
-  {1000, 0, 1234ULL},     // multiple elements, perfect predictions
-  {1000, 1000, 1234ULL},  // multiple elements, no correct predictions
-  {1000, 80, 1234ULL},    // multiple elements, prediction mix
-  {1000, 45, 1234ULL}     // multiple elements, prediction mix
-};
-
-typedef AccuracyTest<float> AccuracyTestF;
-TEST_P(AccuracyTestF, Result) { ASSERT_TRUE(computed_accuracy == ref_accuracy); }
-
-typedef AccuracyTest<double> AccuracyTestD;
-TEST_P(AccuracyTestD, Result) { ASSERT_TRUE(computed_accuracy == ref_accuracy); }
-
-INSTANTIATE_TEST_CASE_P(AccuracyTests, AccuracyTestF, ::testing::ValuesIn(inputs));
-INSTANTIATE_TEST_CASE_P(AccuracyTests, AccuracyTestD, ::testing::ValuesIn(inputs));
-
-// Tests for regression_metrics
-
-template <typename T>
-struct RegressionInputs {
-  T tolerance;
-  int n;                 // number of predictions
-  bool hardcoded_preds;  // (hardcoded_preds) ? use predictions, ref_predictions : use randomly
-                         // generated arrays.
-  std::vector<T> predictions;
-  std::vector<T> ref_predictions;
-  T predictions_range[2];      // predictions in predictions_range if not hardcoded_preds
-  T ref_predictions_range[2];  // predictions in ref_predictions_range if not hardcoded_preds
-  unsigned long long int seed;
-};
-
-template <typename T>
-std::ostream& operator<<(std::ostream& os, const RegressionInputs<T>& reg_inputs)
-{
-  os << "RegressionInputs are {" << reg_inputs.tolerance << ", " << reg_inputs.n << ", "
-     << reg_inputs.hardcoded_preds << ", ";
-  if (reg_inputs.hardcoded_preds) {
-    os << "{";
-    for (int i = 0; i < reg_inputs.n; i++)
-      os << reg_inputs.predictions[i] << ", ";
-    os << "}, {";
-    for (int i = 0; i < reg_inputs.n; i++)
-      os << reg_inputs.ref_predictions[i] << ", ";
-    os << "}";
-    os << "{" << reg_inputs.predictions_range[0] << ", " << reg_inputs.predictions_range[1]
-       << "}, ";
-    os << "{" << reg_inputs.ref_predictions_range[0] << ", " << reg_inputs.ref_predictions_range[1]
-       << "}";
-  } else {
-    os << "{}, {}, {}, {}";
-  }
-  os << ", " << reg_inputs.seed;
-  return os;
-}
-
-template <typename T>
-void host_regression_computations(std::vector<T>& predictions,
-                                  std::vector<T>& ref_predictions,
-                                  const int n,
-                                  std::vector<double>& regression_metrics)
-{
-  double abs_difference_sum = 0;
-  double mse_sum            = 0;
-  std::vector<double> abs_diffs(n);
-
-  for (int i = 0; i < n; i++) {
-    double abs_diff = raft::abs(predictions[i] - ref_predictions[i]);
-    abs_difference_sum += abs_diff;
-    mse_sum += pow(predictions[i] - ref_predictions[i], 2);
-    abs_diffs[i] = abs_diff;
-  }
-
-  regression_metrics[0] = abs_difference_sum / n;
-  regression_metrics[1] = mse_sum / n;
-
-  std::sort(abs_diffs.begin(), abs_diffs.end());
-  int middle = n / 2;
-  if (n % 2 == 1) {
-    regression_metrics[2] = abs_diffs[middle];
-  } else {
-    regression_metrics[2] = (abs_diffs[middle] + abs_diffs[middle - 1]) / 2;
-  }
-}
-
-template <typename T>
-class RegressionMetricsTest : public ::testing::TestWithParam<RegressionInputs<T>> {
- protected:
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<RegressionInputs<T>>::GetParam();
-    computed_regression_metrics.assign(3, -1.0);
-    ref_regression_metrics.assign(3, -1.0);
-
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-    rmm::device_uvector<T> d_predictions(params.n, stream);
-    rmm::device_uvector<T> d_ref_predictions(params.n, stream);
-
-    if (params.hardcoded_preds) {
-      raft::update_device(d_predictions.data(), params.predictions.data(), params.n, stream);
-      raft::update_device(
-        d_ref_predictions.data(), params.ref_predictions.data(), params.n, stream);
-    } else {
-      params.predictions.resize(params.n);
-      params.ref_predictions.resize(params.n);
-      raft::random::Rng r(params.seed);
-      // randomly generate arrays
-      r.uniform(d_predictions.data(),
-                params.n,
-                params.predictions_range[0],
-                params.predictions_range[1],
-                stream);
-      r.uniform(d_ref_predictions.data(),
-                params.n,
-                params.ref_predictions_range[0],
-                params.ref_predictions_range[1],
-                stream);
-      // copy to host to compute reference regression metrics
-      raft::update_host(params.predictions.data(), d_predictions.data(), params.n, stream);
-      raft::update_host(params.ref_predictions.data(), d_ref_predictions.data(), params.n, stream);
-      raft::interruptible::synchronize(stream);
-    }
-
-    MLCommon::Score::regression_metrics(d_predictions.data(),
-                                        d_ref_predictions.data(),
-                                        params.n,
-                                        stream,
-                                        computed_regression_metrics[0],
-                                        computed_regression_metrics[1],
-                                        computed_regression_metrics[2]);
-
-    host_regression_computations(
-      params.predictions, params.ref_predictions, params.n, ref_regression_metrics);
-    raft::interruptible::synchronize(stream);
-  }
-
-  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
-
-  RegressionInputs<T> params;
-  std::vector<double> computed_regression_metrics;
-  std::vector<double> ref_regression_metrics;
-  cudaStream_t stream = 0;
-};
-
-const std::vector<RegressionInputs<float>> regression_inputs_float = {
-  {0.00001f, 1, true, {10.2f}, {20.2f}, {}, {}, 1234ULL},                // single element
-  {0.00001f, 2, true, {10.2f, 40.2f}, {20.2f, 80.2f}, {}, {}, 1234ULL},  // two elements, mean same
-                                                                         // as median
-  // next three inputs should result in identical regression metrics values
-  {0.00001f,
-   6,
-   true,
-   {10.5f, 20.5f, 30.5f, 40.5f, 50.5f, 60.5f},
-   {20.5f, 40.5f, 55.5f, 80.5f, 100.5f, 120.5f},
-   {},
-   {},
-   1234ULL},  // diffs all negative, reverse sorted
-  {0.00001f,
-   6,
-   true,
-   {20.5f, 40.5f, 55.5f, 80.5f, 100.5f, 120.5f},
-   {10.5f, 20.5f, 30.5f, 40.5f, 50.5f, 60.5f},
-   {},
-   {},
-   1234ULL},  // diffs all positive, already sorted
-  {0.00001f,
-   6,
-   true,
-   {40.5f, 55.5f, 20.5f, 120.5f, 100.5f, 80.5f},
-   {20.5f, 30.5f, 10.5f, 60.5f, 50.5f, 40.5f},
-   {},
-   {},
-   1234ULL},  // mix
-  {0.00001f,
-   6,
-   true,
-   {10.5f, 20.5f, 30.5f, 40.5f, 50.5f, 60.5f},
-   {10.5f, 20.5f, 30.5f, 40.5f, 50.5f, 60.5f},
-   {},
-   {},
-   1234ULL},  // identical predictions (0 error)
-  {0.00001f,
-   6,
-   true,
-   {10.5f, 20.5f, 30.5f, 40.5f, 50.5f, 60.5f},
-   {20.5f, 30.5f, 40.5f, 50.5f, 60.5f, 70.5f},
-   {},
-   {},
-   1234ULL},  // predictions[i] - ref_predictions[i] const for each i
-  {0.00001f,
-   2048,
-   false,
-   {},
-   {},
-   {-2048.0f, 2048.0f},
-   {-2048.0f, 2048.0f},
-   1234ULL},  // random mix, even number of elements
-  {0.00001f,
-   2049,
-   false,
-   {},
-   {},
-   {-2048.0f, 2048.0f},
-   {-2048.0f, 2048.0f},
-   1234ULL},  // random mix, odd number of elements
-  {0.00001f,
-   1024,
-   false,
-   {},
-   {},
-   {0.0f, 2048.0f},
-   {8192.0f, 16384.0f},
-   1234ULL},  // random mix, diffs are all negative
-  {0.00001f,
-   1024,
-   false,
-   {},
-   {},
-   {8192.0f, 16384.0f},
-   {0.0f, 2048.0f},
-   1234ULL}  // random mix, diffs are all positive
-};
-
-const std::vector<RegressionInputs<double>> regression_inputs_double = {
-  {0.0000001, 1, true, {10.2}, {20.2}, {}, {}, 1234ULL},              // single element
-  {0.0000001, 2, true, {10.2, 40.2}, {20.2, 80.2}, {}, {}, 1234ULL},  // two elements
-  {0.0000001,
-   6,
-   true,
-   {10.5, 20.5, 30.5, 40.5, 50.5, 60.5},
-   {20.5, 40.5, 55.5, 80.5, 100.5, 120.5},
-   {},
-   {},
-   1234ULL},  // diffs all negative, reverse sorted
-  {0.0000001,
-   6,
-   true,
-   {20.5, 40.5, 55.5, 80.5, 100.5, 120.5},
-   {10.5, 20.5, 30.5, 40.5, 50.5, 60.5},
-   {},
-   {},
-   1234ULL},  // diffs all positive, already sorted
-  {0.0000001,
-   6,
-   true,
-   {40.5, 55.5, 20.5, 120.5, 100.5, 80.5},
-   {20.5, 30.5, 10.5, 60.5, 50.5, 40.5},
-   {},
-   {},
-   1234ULL},  // mix
-  {0.0000001,
-   6,
-   true,
-   {10.5, 20.5, 30.5, 40.5, 50.5, 60.5},
-   {10.5, 20.5, 30.5, 40.5, 50.5, 60.5},
-   {},
-   {},
-   1234ULL},  // identical predictions (0 error)
-  {0.0000001,
-   6,
-   true,
-   {10.5, 20.5, 30.5, 40.5, 50.5, 60.5},
-   {20.5, 30.5, 40.5, 50.5, 60.5, 70.5},
-   {},
-   {},
-   1234ULL},  // predictions[i] - ref_predictions[i] const for each i
-  {0.0000001,
-   2048,
-   false,
-   {},
-   {},
-   {-2048.0, 2048.0},
-   {-2048.0, 2048.0},
-   1234ULL},  // random mix, even number of elements
-  {0.0000001,
-   2049,
-   false,
-   {},
-   {},
-   {-2048.0, 2048.0},
-   {-2048.0, 2048.0},
-   1234ULL},  // random mix, odd number of elements
-  {0.0000001, 1024, false, {}, {}, {0, 2048}, {8192.0, 16384.0}, 1234ULL},  // random mix, diffs are
-                                                                            // all negative
-  {0.0000001, 1024, false, {}, {}, {8192.0, 16384.0}, {0.0, 2048}, 1234ULL}  // random mix, diffs
-                                                                             // are all positive
-};
-
-typedef RegressionMetricsTest<float> RegressionMetricsTestF;
-TEST_P(RegressionMetricsTestF, Result)
-{
-  for (int i = 0; i < 3; i++) {
-    ASSERT_TRUE(match(computed_regression_metrics[i],
-                      ref_regression_metrics[i],
-                      raft::CompareApprox<float>(params.tolerance)));
-  }
-}
-
-typedef RegressionMetricsTest<double> RegressionMetricsTestD;
-TEST_P(RegressionMetricsTestD, Result)
-{
-  for (int i = 0; i < 3; i++) {
-    ASSERT_TRUE(match(computed_regression_metrics[i],
-                      ref_regression_metrics[i],
-                      raft::CompareApprox<double>(params.tolerance)));
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(RegressionMetricsTests,
-                        RegressionMetricsTestF,
-                        ::testing::ValuesIn(regression_inputs_float));
-INSTANTIATE_TEST_CASE_P(RegressionMetricsTests,
-                        RegressionMetricsTestD,
-                        ::testing::ValuesIn(regression_inputs_double));
-
-}  // end namespace Score
-}  // end namespace MLCommon
diff --git a/cpp/test/prims/silhouette_score.cu b/cpp/test/prims/silhouette_score.cu
deleted file mode 100644
index f42e038e4e..0000000000
--- a/cpp/test/prims/silhouette_score.cu
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "test_utils.h"
-#include <algorithm>
-#include <cuml/metrics/metrics.hpp>
-#include <gtest/gtest.h>
-#include <iostream>
-#include <metrics/batched/silhouette_score.cuh>
-#include <metrics/silhouette_score.cuh>
-#include <raft/cudart_utils.h>
-#include <raft/distance/distance_type.hpp>
-#include <random>
-#include <rmm/device_uvector.hpp>
-
-namespace MLCommon {
-namespace Metrics {
-
-// parameter structure definition
-struct silhouetteScoreParam {
-  int nRows;
-  int nCols;
-  int nLabels;
-  raft::distance::DistanceType metric;
-  int chunk;
-  double tolerance;
-};
-
-// test fixture class
-template <typename LabelT, typename DataT>
-class silhouetteScoreTest : public ::testing::TestWithParam<silhouetteScoreParam> {
- protected:
-  silhouetteScoreTest()
-    : d_X(0, handle.get_stream()),
-      sampleSilScore(0, handle.get_stream()),
-      d_labels(0, handle.get_stream())
-  {
-  }
-
-  void host_silhouette_score()
-  {
-    // generating random value test input
-    std::vector<double> h_X(nElements, 0.0);
-    std::vector<int> h_labels(nRows, 0);
-    std::random_device rd;
-    std::default_random_engine dre(nElements * nLabels);
-    std::uniform_int_distribution<int> intGenerator(0, nLabels - 1);
-    std::uniform_real_distribution<double> realGenerator(0, 100);
-
-    std::generate(h_X.begin(), h_X.end(), [&]() { return realGenerator(dre); });
-    std::generate(h_labels.begin(), h_labels.end(), [&]() { return intGenerator(dre); });
-
-    // allocating and initializing memory to the GPU
-    auto stream = handle.get_stream();
-    d_X.resize(nElements, stream);
-    d_labels.resize(nElements, stream);
-    RAFT_CUDA_TRY(cudaMemsetAsync(d_X.data(), 0, d_X.size() * sizeof(DataT), stream));
-    RAFT_CUDA_TRY(cudaMemsetAsync(d_labels.data(), 0, d_labels.size() * sizeof(LabelT), stream));
-    sampleSilScore.resize(nElements, stream);
-
-    raft::update_device(d_X.data(), &h_X[0], (int)nElements, stream);
-    raft::update_device(d_labels.data(), &h_labels[0], (int)nElements, stream);
-
-    // finding the distance matrix
-
-    rmm::device_uvector<double> d_distanceMatrix(nRows * nRows, stream);
-    double* h_distanceMatrix = (double*)malloc(nRows * nRows * sizeof(double*));
-
-    ML::Metrics::pairwise_distance(
-      handle, d_X.data(), d_X.data(), d_distanceMatrix.data(), nRows, nRows, nCols, params.metric);
-
-    handle.sync_stream(stream);
-
-    raft::update_host(h_distanceMatrix, d_distanceMatrix.data(), nRows * nRows, stream);
-
-    // finding the bincount array
-
-    double* binCountArray = (double*)malloc(nLabels * sizeof(double*));
-    memset(binCountArray, 0, nLabels * sizeof(double));
-
-    for (int i = 0; i < nRows; ++i) {
-      binCountArray[h_labels[i]] += 1;
-    }
-
-    // finding the average intra cluster distance for every element
-
-    double* a = (double*)malloc(nRows * sizeof(double*));
-
-    for (int i = 0; i < nRows; ++i) {
-      int myLabel               = h_labels[i];
-      double sumOfIntraClusterD = 0;
-
-      for (int j = 0; j < nRows; ++j) {
-        if (h_labels[j] == myLabel) { sumOfIntraClusterD += h_distanceMatrix[i * nRows + j]; }
-      }
-
-      if (binCountArray[myLabel] <= 1)
-        a[i] = -1;
-      else
-        a[i] = sumOfIntraClusterD / (binCountArray[myLabel] - 1);
-    }
-
-    // finding the average inter cluster distance for every element
-
-    double* b = (double*)malloc(nRows * sizeof(double*));
-
-    for (int i = 0; i < nRows; ++i) {
-      int myLabel          = h_labels[i];
-      double minAvgInterCD = ULLONG_MAX;
-
-      for (int j = 0; j < nLabels; ++j) {
-        int curClLabel = j;
-        if (curClLabel == myLabel) continue;
-        double avgInterCD = 0;
-
-        for (int k = 0; k < nRows; ++k) {
-          if (h_labels[k] == curClLabel) { avgInterCD += h_distanceMatrix[i * nRows + k]; }
-        }
-
-        if (binCountArray[curClLabel])
-          avgInterCD /= binCountArray[curClLabel];
-        else
-          avgInterCD = ULLONG_MAX;
-        minAvgInterCD = min(minAvgInterCD, avgInterCD);
-      }
-
-      b[i] = minAvgInterCD;
-    }
-
-    // finding the silhouette score for every element
-
-    double* truthSampleSilScore = (double*)malloc(nRows * sizeof(double*));
-    for (int i = 0; i < nRows; ++i) {
-      if (a[i] == -1)
-        truthSampleSilScore[i] = 0;
-      else if (a[i] == 0 && b[i] == 0)
-        truthSampleSilScore[i] = 0;
-      else
-        truthSampleSilScore[i] = (b[i] - a[i]) / max(a[i], b[i]);
-      truthSilhouetteScore += truthSampleSilScore[i];
-    }
-
-    truthSilhouetteScore /= nRows;
-  }
-
-  // the constructor
-  void SetUp() override
-  {
-    // getting the parameters
-    params = ::testing::TestWithParam<silhouetteScoreParam>::GetParam();
-
-    nRows     = params.nRows;
-    nCols     = params.nCols;
-    nLabels   = params.nLabels;
-    chunk     = params.chunk;
-    nElements = nRows * nCols;
-
-    host_silhouette_score();
-
-    // calling the silhouette_score CUDA implementation
-    computedSilhouetteScore = MLCommon::Metrics::silhouette_score(handle,
-                                                                  d_X.data(),
-                                                                  nRows,
-                                                                  nCols,
-                                                                  d_labels.data(),
-                                                                  nLabels,
-                                                                  sampleSilScore.data(),
-                                                                  handle.get_stream(),
-                                                                  params.metric);
-
-    batchedSilhouetteScore = Batched::silhouette_score(handle,
-                                                       d_X.data(),
-                                                       nRows,
-                                                       nCols,
-                                                       d_labels.data(),
-                                                       nLabels,
-                                                       sampleSilScore.data(),
-                                                       chunk,
-                                                       params.metric);
-  }
-
-  // declaring the data values
-  silhouetteScoreParam params;
-  int nLabels;
-  rmm::device_uvector<DataT> d_X;
-  rmm::device_uvector<DataT> sampleSilScore;
-  rmm::device_uvector<LabelT> d_labels;
-  int nRows;
-  int nCols;
-  int nElements;
-  double truthSilhouetteScore    = 0;
-  double computedSilhouetteScore = 0;
-  double batchedSilhouetteScore  = 0;
-  raft::handle_t handle;
-  int chunk;
-};
-
-// setting test parameter values
-const std::vector<silhouetteScoreParam> inputs = {
-  {4, 2, 3, raft::distance::DistanceType::L2Expanded, 4, 0.00001},
-  {4, 2, 2, raft::distance::DistanceType::L2SqrtUnexpanded, 2, 0.00001},
-  {8, 8, 3, raft::distance::DistanceType::L2Unexpanded, 4, 0.00001},
-  {11, 2, 5, raft::distance::DistanceType::L2Expanded, 3, 0.00001},
-  {40, 2, 8, raft::distance::DistanceType::L2Expanded, 10, 0.00001},
-  {12, 7, 3, raft::distance::DistanceType::CosineExpanded, 8, 0.00001},
-  {7, 5, 5, raft::distance::DistanceType::L1, 2, 0.00001}};
-
-// writing the test suite
-typedef silhouetteScoreTest<int, double> silhouetteScoreTestClass;
-TEST_P(silhouetteScoreTestClass, Result)
-{
-  ASSERT_NEAR(computedSilhouetteScore, truthSilhouetteScore, params.tolerance);
-  ASSERT_NEAR(batchedSilhouetteScore, truthSilhouetteScore, params.tolerance);
-}
-INSTANTIATE_TEST_CASE_P(silhouetteScore, silhouetteScoreTestClass, ::testing::ValuesIn(inputs));
-
-}  // end namespace Metrics
-}  // end namespace MLCommon
diff --git a/cpp/test/prims/trustworthiness.cu b/cpp/test/prims/trustworthiness.cu
deleted file mode 100644
index f51f42ffa4..0000000000
--- a/cpp/test/prims/trustworthiness.cu
+++ /dev/null
@@ -1,335 +0,0 @@
-/*
- * Copyright (c) 2018-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "test_utils.h"
-#include <gtest/gtest.h>
-#include <iostream>
-#include <metrics/trustworthiness_score.cuh>
-#include <raft/cudart_utils.h>
-#include <raft/distance/distance.hpp>
-#include <vector>
-
-namespace MLCommon {
-namespace Score {
-
-class TrustworthinessScoreTest : public ::testing::Test {
- protected:
-  void basicTest()
-  {
-    std::vector<float> X = {
-      5.6142087,   8.59787,     -4.382763,   -3.6452143,  -5.8816037,  -0.6330313,  4.6920023,
-      -0.79210913, 0.6106314,   2.1210914,   5.919943,    -8.43784,    -6.4819884,  0.41001374,
-      -6.1052523,  -4.0825715,  -5.314755,   -2.834671,   5.751696,    -6.5012555,  -0.4719201,
-      -7.53353,    7.6789393,   -1.4959852,  -5.5977287,  -9.564147,   1.2902534,   3.559834,
-      -6.7659483,  8.265964,    4.595404,    9.133477,    -6.1553917,  -6.319754,   -2.9039452,
-      4.4150834,   -3.094395,   -4.426273,   9.584571,    -5.64133,    6.6209483,   7.4044604,
-      3.9620576,   5.639907,    10.33007,    -0.8792053,  5.143776,    -7.464049,   1.2448754,
-      -5.6300974,  5.4518576,   4.119535,    6.749645,    7.627064,    -7.2298336,  1.9681473,
-      -6.9083176,  6.404673,    0.07186685,  9.0994835,   8.51037,     -8.986389,   0.40534487,
-      2.115397,    4.086756,    1.2284287,   -2.6272132,  0.06527536,  -9.587425,   -7.206078,
-      7.864875,    7.4397306,   -6.9233336,  -2.6643622,  3.3466153,   7.0408177,   -3.6069896,
-      -9.971769,   4.4075623,   7.9063697,   2.559074,    4.323717,    1.6867131,   -1.1576937,
-      -9.893141,   -3.251416,   -7.4889135,  -4.0588717,  -2.73338,    -7.4852257,  3.4460473,
-      9.759119,    -5.4680476,  -4.722435,   -8.032619,   -1.4598992,  4.227361,    3.135568,
-      1.1950601,   1.1982028,   6.998856,    -6.131138,   -6.6921015,  0.5361224,   -7.1213965,
-      -5.6104236,  -7.2212887,  -2.2710054,  8.544764,    -6.0254574,  1.4582269,   -5.5587835,
-      8.031556,    -0.26328218, -5.2591386,  -9.262641,   2.8691363,   5.299787,    -9.209455,
-      8.523085,    5.180329,    10.655528,   -5.7171874,  -6.7739563,  -3.6306462,  4.067106,
-      -1.5912259,  -3.2345476,  8.042973,    -3.6364832,  4.1242137,   9.886953,    5.4743724,
-      6.3058076,   9.369645,    -0.5175337,  4.9859877,   -7.879498,   1.358422,    -4.147944,
-      3.8984218,   5.894656,    6.4903927,   8.702036,    -8.023722,   2.802145,    -7.748032,
-      5.8461113,   -0.34215945, 11.298865,   1.4107164,   -9.949621,   -1.6257563,  -10.655836,
-      2.4528909,   1.1570255,   5.170669,    2.8398793,   7.1838694,   9.088459,    2.631155,
-      3.964414,    2.8769252,   0.04198391,  -0.16993195, 3.6747139,   -2.8377378,  6.1782537,
-      10.759618,   -4.5642614,  -8.522967,   0.8614642,   6.623416,    -1.029324,   5.5488334,
-      -7.804511,   2.128833,    7.9042315,   7.789576,    -2.7944536,  0.72271067,  -10.511495,
-      -0.78634536, -10.661714,  2.9376361,   1.9148129,   6.22859,     0.26264945,  8.028384,
-      6.8743043,   0.9351067,   7.0690722,   4.2846055,   1.4134506,   -0.18144785, 5.2778087,
-      -1.7140163,  9.217541,    8.602799,    -2.6537218,  -7.8377395,  1.1244944,   5.4540544,
-      -0.38506773, 3.9885726,   -10.76455,   1.4440702,   9.136163,    6.664117,    -5.7046547,
-      8.038592,    -9.229767,   -0.2799413,  3.6064725,   4.187257,    1.0516582,   -2.0707326,
-      -0.7615968,  -8.561018,   -3.7831352,  10.300297,   5.332594,    -6.5880876,  -4.2508664,
-      1.7985519,   5.7226253,   -4.1223383,  -9.6697855,  1.4885283,   7.524974,    1.7206005,
-      4.890457,    3.7264557,   0.4428284,   -9.922455,   -4.250455,   -6.4410596,  -2.107994,
-      -1.4109765,  -6.1325397,  0.32883006,  6.0489736,   7.7257385,   -8.281174,   1.0129383,
-      -10.792166,  8.378851,    10.802716,   9.848448,    -9.188757,   1.3151443,   1.9971865,
-      -2.521849,   4.3268294,   -7.775683,   -2.2902298,  3.0824065,   -7.17559,    9.6100855,
-      7.3965735,   -10.476525,  5.895973,    -3.6974669,  -7.6688933,  1.7354839,   -7.4045196,
-      -1.7992063,  -4.0394845,  5.2471714,   -2.250571,   2.528036,    -8.343515,   -2.2374575,
-      -10.019771,  0.73371273,  3.1853926,   2.7994921,   2.6637669,   7.620401,    7.515571,
-      0.68636256,  5.834537,    4.650282,    -1.0362619,  0.4461701,   3.7870514,   -4.1340904,
-      7.202998,    9.736904,    -3.005512,   -8.920467,   1.1228397,   6.2598724,   1.2812365,
-      4.5442104,   -8.791537,   0.92113096,  8.464749,    8.359035,    -4.3923397,  1.2252625,
-      -10.1986475, -1.4409319,  -10.013967,  3.9071581,   1.683064,    4.877419,    1.6570637,
-      9.559105,    7.3546534,   0.36635467,  5.220211,    4.6303267,   0.6601065,   0.16149978,
-      3.8818731,   -3.4438233,  8.42085,     8.659159,    -3.0935583,  -8.039611,   2.3060374,
-      5.134666,    1.0458113,   6.0190983,   -9.143728,   0.99048865,  9.210842,    6.670241,
-      -5.9614363,  0.8747396,   7.078824,    8.067469,    -10.314754,  0.45977542,  -9.28306,
-      9.1838665,   9.318644,    7.189082,    -11.092555,  1.0320464,   3.882163,    0.10953151,
-      7.9029684,   -6.9068265,  -1.3526366,  5.3996363,   -8.430931,   11.452577,   6.39663,
-      -11.090514,  4.6662245,   -3.1268113,  -8.357452,   2.2276728,   -10.357126,  -0.9291848,
-      -3.4193344,  3.1289792,   -2.5030103,  6.772719,    11.457757,   -4.2125936,  -6.684548,
-      -4.7611327,  3.6960156,   -2.3030636,  -3.0591488,  10.452471,   -4.1267314,  5.66614,
-      7.501461,    5.072407,    6.636537,    8.990381,    -0.2559256,  4.737867,    -6.2149944,
-      2.535682,    -5.5484023,  5.7113924,   3.4742818,   7.9915137,   7.0052586,   -7.156467,
-      1.4354781,   -8.286235,   5.7523417,   -2.4175215,  9.678009,    0.05066403,  -9.645226,
-      -2.2658763,  -9.518178,   4.493372,    2.3232365,   2.1659086,   0.42507997,  8.360246,
-      8.23535,     2.6878164,   5.236947,    3.4924245,   -0.6089895,  0.8884741,   4.359464,
-      -4.6073823,  7.83441,     8.958755,    -3.4690795,  -9.182282,   1.2478025,   5.6311107,
-      -1.2408862,  3.6316886,   -8.684654,   2.1078515,   7.2813864,   7.9265943,   -3.6135032,
-      0.4571511,   8.493568,    10.496853,   -7.432897,   0.8625995,   -9.607528,   7.2899456,
-      8.83158,     8.908199,    -10.300263,  1.1451302,   3.7871468,   -0.97040755, 5.7664757,
-      -8.9688,     -2.146672,   5.9641485,   -6.2908535,  10.126465,   6.1553903,   -12.066902,
-      6.301596,    -5.0419583,  -8.228695,   2.4879954,   -8.918582,   -3.7434099,  -4.1593685,
-      3.7431836,   -1.1704745,  0.5524103,   9.109399,    9.571567,    -11.209955,  1.2462777,
-      -9.554555,   9.091726,    11.477966,   7.630937,    -10.450911,  1.9205878,   5.358983,
-      -0.44546837, 6.7611346,   -9.74753,    -0.5939732,  3.8892255,   -6.437991,   10.294727,
-      5.6723895,   -10.7883,    6.192348,    -5.293862,   -10.811491,  1.0194173,   -7.074576,
-      -3.192368,   -2.5231771,  4.2791643,   -0.53309685, 0.501366,    9.636625,    7.710316,
-      -6.4219728,  1.0975566,   -8.218886,   6.9011984,   9.873679,    8.903804,    -9.316832,
-      1.2404599,   4.9039655,   1.2272617,   4.541515,    -5.2753224,  -3.2196746,  3.1303136,
-      -7.285681,   9.041425,    5.6417427,   -9.93667,    5.7548947,   -5.113397,   -8.544622,
-      4.182665,    -7.7709813,  -3.2810235,  -3.312072,   3.8900535,   -2.0604856,  6.709082,
-      -8.461194,   1.2666026,   4.8770437,   2.6955879,   3.0340345,   -1.1614609,  -3.536341,
-      -7.090382,   -5.36146,    9.072544,    6.4554095,   -4.4728956,  -1.88395,    3.1095037,
-      8.782348,    -3.316743,   -8.65248,    1.6802986,   8.186188,    2.1783829,   4.931278,
-      4.158475,    1.4033595,   -11.320101,  -3.7084908,  -6.740436,   -2.5555193,  -1.0451177,
-      -6.5569925,  0.82810307,  8.505919,    8.332857,    -9.488569,   -0.21588463, -8.056692,
-      8.493993,    7.6401625,   8.812983,    -9.377281,   2.4369764,   3.1766508,   0.6300803,
-      5.6666765,   -7.913654,   -0.42301777, 4.506412,    -7.8954244,  10.904591,   5.042256,
-      -9.626183,   8.347351,    -3.605006,   -7.923387,   1.1024277,   -8.705793,   -2.5151258,
-      -2.5066147,  4.0515003,   -2.060757,   6.2635093,   8.286584,    -6.0509276,  -6.76452,
-      -3.1158175,  1.6578803,   -1.4608748,  -1.24211,    8.151246,    -4.2970877,  6.093071,
-      7.4911637,   4.51018,     4.8425875,   9.211085,    -2.4386222,  4.5830803,   -5.6079445,
-      2.3713675,   -4.0707507,  3.1787417,   5.462342,    6.915912,    6.3928423,   -7.2970796,
-      5.0112796,   -9.140893,   4.9990606,   0.38391754,  7.7088532,   1.9340848,   8.18833,
-      8.16617,     -9.42086,    -0.3388326,  -9.659727,   8.243045,    8.099073,    8.439428,
-      -7.038694,   2.1077902,   3.3866816,   -1.9975324,  7.4972878,   -7.2525196,  -1.553731,
-      4.08758,     -6.6922374,  9.50525,     4.026735,    -9.243538,   7.2740564,   -3.9319072,
-      -6.3228955,  1.6693478,   -7.923119,   -3.7423058,  -2.2813146,  5.3469067,   -1.8285407,
-      3.3118162,   8.826356,    -4.4641976,  -6.4751124,  -9.200089,   -2.519147,   4.225298,
-      2.4105988,   -0.4344186,  0.53441775,  5.2836394,   -8.2816105,  -4.996147,   -1.6870759,
-      -7.8543897,  -3.9788852,  -7.0346904,  -3.1289773,  7.4567637,   -5.6227813,  1.0709786,
-      -8.866012,   8.427324,    -1.1755563,  -5.789216,   -8.197835,   5.3342214,   6.0646234,
-      -6.8975716,  7.717031,    3.480355,    8.312151,    -3.6645212,  -3.0976524,  -8.090359,
-      -1.9176173,  2.4257212,   1.9700835,   0.4098958,   2.1341088,   7.652741,    -9.9595585,
-      -5.989757,   0.10119354,  -7.935407,   -5.792786,   -5.22783,    -4.318978,   5.414037,
-      -6.4621663,  1.670883,    -6.9224787,  8.696932,    -2.0214002,  -6.6681314,  -8.326418,
-      4.9049683,   5.4442496,   -6.403739,   7.5822453,   7.0972915,   -9.072851,   -0.23897195,
-      1.7662339,   5.3096304,   1.983179,    -2.222645,   -0.34700772, -9.094717,   -6.107907,
-      9.525174,    8.1550665,   -5.6940084,  -4.1636486,  1.7360662,   8.528821,    -3.7299833,
-      -9.341266,   2.608542,    9.108706,    0.7978509,   4.2488184,   2.454484,    0.9446999,
-      -10.106636,  -3.8973773,  -6.6566644,  -4.5647273,  -0.99837756, -6.568582,   9.324853,
-      -7.9020953,  2.0910501,   2.2896829,   1.6790711,   1.3159255,   -3.5258796,  1.8898442,
-      -8.105812,   -4.924962,   8.771129,    7.1202874,   -5.991957,   -3.4106019,  2.4450088,
-      7.796387,    -3.055946,   -7.8971434,  1.9856719,   9.001636,    1.8511922,   3.019749,
-      3.1227696,   0.4822102,   -10.021213,  -3.530504,   -6.225959,   -3.0029628,  -1.7881511,
-      -7.3879776,  1.3925704,   9.499782,    -3.7318087,  -3.7074296,  -7.7466836,  -1.5284524,
-      4.0535855,   3.112011,    0.10340207,  -0.5429599,  6.67026,     -9.155924,   -4.924038,
-      0.64248866,  -10.0103655, -3.2742946,  -4.850029,   -3.6707063,  8.586258,    -5.855605,
-      4.906918,    -6.7813993,  7.9938135,   -2.5473144,  -5.688948,   -7.822478,   2.1421318,
-      4.66659,     -9.701272,   9.549149,    0.8998125,   -8.651497,   -0.56899565, -8.639817,
-      2.3088377,   2.1264515,   3.2764478,   2.341989,    8.594338,    8.630639,    2.8440373,
-      6.2043204,   4.433932,    0.6320018,   -1.8179281,  5.09452,     -1.5741565,  8.153934,
-      8.744339,    -3.6945698,  -8.883078,   1.5329908,   5.2745943,   0.44716078,  4.8809066,
-      -7.9594903,  1.134374,    9.233994,    6.5528665,   -4.520542,   9.477355,    -8.622195,
-      -0.23191702, 2.0485356,   3.9379985,   1.5916302,   -1.4516805,  -0.0843819,  -7.8554378,
-      -5.88308,    7.999766,    6.2572145,   -5.585321,   -4.0097756,  0.42382592,  6.160884,
-      -3.631315,   -8.333449,   2.770595,    7.8495173,   3.3331623,   4.940415,    3.6207345,
-      -0.037517,   -11.034698,  -3.185103,   -6.614664,   -3.2177854,  -2.0792234,  -6.8879867,
-      7.821685,    -8.455084,   1.0784642,   4.0033927,   2.7343264,   2.6052725,   -4.1224284,
-      -0.89305353, -6.8267674,  -4.9715133,  8.880253,    5.6994023,   -5.9695024,  -4.9181266,
-      1.3017995,   7.972617,    -3.9452884,  -10.424556,  2.4504194,   6.21529,     0.93840516,
-      4.2070026,   6.159839,    0.91979957,  -8.706724,   -4.317946,   -6.6823545,  -3.0388,
-      -2.464262,   -7.3716645,  1.3926703,   6.544412,    -5.6251183,  -5.122411,   -8.622049,
-      -2.3905911,  3.9138813,   1.9779967,   -0.05011125, 0.13310997,  7.229751,    -9.742043,
-      -8.08724,    1.2426697,   -7.9230795,  -3.3162494,  -7.129571,   -3.5488048,  7.4701195,
-      -5.2357526,  0.5917681,   -6.272206,   6.342328,    -2.909731,   -4.991607,   -8.845513,
-      3.3228495,   7.033246,    -7.8180246,  8.214469,    6.3910093,   9.185153,    -6.20472,
-      -7.713809,   -3.8481297,  3.5579286,   0.7078448,   -3.2893546,  7.384514,    -4.448121,
-      3.0104196,   9.492943,    8.024847,    4.9114385,   9.965594,    -3.014036,   5.182494,
-      -5.8806014,  2.5312455,   -5.9926524,  4.474469,    6.3717875,   6.993105,    6.493093,
-      -8.935534,   3.004074,    -8.055647,   8.315765,    -1.3026813,  8.250377,    0.02606229,
-      6.8508425,   9.655665,    -7.0116496,  -0.41060972, -10.049198,  7.897801,    6.7791023,
-      8.3362,      -9.821014,   2.491157,    3.5160472,   -1.6228812,  7.398063,    -8.769123,
-      -3.1743705,  3.2827861,   -6.497855,   10.831924,   5.2761307,   -9.704417,   4.3817043,
-      -3.9841619,  -8.111647,   1.1883026,   -8.115312,   -2.9240117,  -5.8879666,  4.20928,
-      -0.3587938,  6.935672,    -10.177582,  0.48819053,  3.1250648,   2.9306343,   3.082544,
-      -3.477687,   -1.3768549,  -7.4922366,  -3.756631,   10.039836,   3.6670392,   -5.9761434,
-      -4.4728765,  3.244255,    7.027899,    -2.3806512,  -10.4100685, 1.605716,    7.7953773,
-      0.5408159,   1.7156523,   3.824097,    -1.0604783,  -10.142124,  -5.246805,   -6.5283823,
-      -4.579547,   -2.42714,    -6.709197,   2.7782338,   7.33353,     -6.454507,   -2.9929368,
-      -7.8362985,  -2.695445,   2.4900775,   1.6682367,   0.4641757,   -1.0495365,  6.9631333,
-      -9.291356,   -8.23837,    -0.34263706, -8.275113,   -2.8454232,  -5.0864096,  -2.681942,
-      7.5450225,   -6.2517986,  0.06810654,  -6.470652,   4.9042645,   -1.8369255,  -6.6937943,
-      -7.9625087,  2.8510258,   6.180508,    -8.282598,   7.919079,    1.4897474,   6.7217417,
-      -4.2459426,  -4.114431,   -8.375707,   -2.143264,   5.6972933,   1.5574739,   0.39375135,
-      1.7930849,   5.1737595,   -7.826241,   -5.160268,   -0.80433255, -7.839536,   -5.2620406,
-      -5.4643164,  -3.185536,   6.620315,    -7.065227,   1.0524757,   -6.125088,   5.7126627,
-      -1.6161644,  -3.852159,   -9.164279,   2.7005782,   5.946544,    -8.468236,   8.2145405,
-      1.1035942,   6.590157,    -4.0461283,  -4.8090615,  -7.6702685,  -2.1121511,  5.1147075,
-      1.6128504,   2.0064135,   1.0544407,   6.0038295,   -7.8282537,  -4.801278,   0.32349443,
-      -8.0649805,  -4.372714,   -5.61336,    -5.21394,    8.176595,    -5.4753284,  1.7800134,
-      -8.267283,   7.2133374,   -0.16594432, -6.317046,   -9.490406,   4.1261597,   5.473317,
-      -7.7551675,  7.007468,    7.478628,    -8.801905,   0.10975724,  3.5478222,   4.797803,
-      1.3825226,   -3.357369,   0.99262005,  -6.94877,    -5.4781394,  9.632604,    5.7492557,
-      -5.9014316,  -3.1632116,  2.340859,    8.708098,    -3.1255999,  -8.848661,   4.5612836,
-      8.455157,    0.73460823,  4.112301,    4.392744,    -0.30759293, -6.8036823,  -3.0331545,
-      -8.269506,   -2.82415,    -0.9411246,  -5.993506,   2.1618164,   -8.716055,   -0.7432543,
-      -10.255819,  3.095418,    2.5131428,   4.752442,    0.9907621,   7.8279433,   7.85814,
-      0.50430876,  5.2840405,   4.457291,    0.03330028,  -0.40692952, 3.9244103,   -2.117118,
-      7.6977615,   8.759009,    -4.2157164,  -9.136053,   3.247858,    4.668686,    0.76162136,
-      5.3833632,   -9.231471,   0.44309422,  8.380872,    6.7211227,   -3.091507,   2.173508,
-      -9.038242,   -1.3666698,  -9.819077,   0.37825826,  2.3898845,   4.2440815,   1.9161536,
-      7.24787,     6.9124637,   1.6238527,   5.1140285,   3.1935842,   1.02845,     -1.1273454,
-      5.638998,    -2.497932,   8.342559,    8.586319,    -2.9069402,  -7.6387944,  3.5975037,
-      4.4115705,   0.41506064,  4.9078383,   -9.68327,    1.8159529,   9.744613,    8.40622,
-      -4.495336,   9.244892,    -8.789869,   1.3158468,   4.018167,    3.3922846,   2.652022,
-      -2.7495477,  0.2528986,   -8.268324,   -6.004913,   10.428784,   6.6580734,   -5.537176,
-      -1.7177434,  2.7504628,   6.7735,      -2.4454272,  -9.998361,   2.9483433,   6.8266654,
-      2.3787718,   4.472637,    2.5871701,   0.7355365,   -7.7027745,  -4.1879907,  -7.172832,
-      -4.1843605,  -0.03646783, -5.419406,   6.958486,    11.011111,   -7.1821184,  -7.956423,
-      -3.408451,   4.6850276,   -2.348787,   -4.398289,   6.9787564,   -3.8324208,  5.967827,
-      8.433518,    4.660108,    5.5657144,   9.964243,    -1.3515275,  6.404833,    -6.4805903,
-      2.4379845,   -6.0816774,  1.752272,    5.3771873,   6.9613523,   6.9788294,   -6.3894596,
-      3.7521114,   -6.8034263,  6.4458385,   -0.7233525,  10.512529,   4.362273,    9.231461,
-      -6.3382263,  -7.659,      -3.461823,   4.71463,     0.17817476,  -3.685746,   7.2962036,
-      -4.6489477,  5.218017,    11.546999,   4.7218375,   6.8498397,   9.281103,    -3.900459,
-      6.844054,    -7.0886965,  -0.05019227, -8.233724,   5.5808983,   6.374517,    8.321048,
-      7.969449,    -7.3478637,  1.4917561,   -8.003144,   4.780668,    -1.1981848,  7.753739,
-      2.0260844,   -8.880096,   -3.4258451,  -7.141975,   1.9637157,   1.814725,    5.311151,
-      1.4831505,   7.8483663,   7.257948,    1.395786,    6.417756,    5.376912,    0.59505713,
-      0.00062552,  3.6634305,   -4.159713,   7.3571978,   10.966816,   -2.5419605,  -8.466229,
-      1.904205,    5.6338267,   -0.52567476, 5.59736,     -8.361799,   0.5009981,   8.460681,
-      7.3891273,   -3.5272243,  5.0552278,   9.921456,    -7.69693,    -7.286378,   -1.9198836,
-      3.1666567,   -2.5832257,  -2.2445817,  9.888111,    -5.076563,   5.677401,    7.497946,
-      5.662994,    5.414262,    8.566503,    -2.5530663,  7.1032815,   -6.0612082,  1.3419591,
-      -4.9595256,  4.3377542,   4.3790717,   6.793512,    8.383502,    -7.1278043,  3.3240774,
-      -9.379446,   6.838661,    -0.81241214, 8.694813,    0.79141915,  7.632467,    8.575382,
-      -8.533798,   0.28954387,  -7.5675836,  5.8653326,   8.97235,     7.1649346,   -10.575289,
-      0.9359381,   5.02381,     -0.5609511,  5.543464,    -7.69131,    -2.1792977,  2.4729247,
-      -6.1917787,  10.373678,   7.6549597,   -8.809486,   5.5657206,   -3.3169382,  -8.042887,
-      2.0874746,   -7.079005,   -3.33398,    -3.6843317,  4.0172358,   -2.0754814,  1.1726758,
-      7.4618697,   6.9483604,   -8.469206,   0.7401797,   -10.318176,  8.384557,    10.5476265,
-      9.146971,    -9.250223,   0.6290606,   4.4941425,   -0.7514017,  7.2271705,   -8.309598,
-      -1.4761636,  4.0140634,   -6.021102,   9.132852,    5.6610966,   -11.249811,  8.359293,
-      -1.9445792,  -7.7393436,  -0.3931331,  -8.824441,   -2.5995944,  -2.5714035,  4.140213,
-      -3.6863053,  5.517265,    9.020411,    -4.9286127,  -7.871219,   -3.7446704,  2.5179656,
-      -1.4543481,  -2.2703636,  7.010597,    -3.6436229,  6.753862,    7.4129915,   7.1406755,
-      5.653706,    9.5445175,   0.15698843,  4.761813,    -7.698002,   1.6870106,   -4.5410123,
-      4.171763,    5.3747005,   6.341021,    7.456738,    -8.231657,   2.763487,    -9.208167,
-      6.676799,    -1.1957736,  10.062605,   4.0975976,   7.312957,    -2.4981596,  -2.9658387,
-      -8.150425,   -2.1075552,  2.64375,     1.6636052,   1.1483809,   0.09276015,  5.8556347,
-      -7.8481026,  -5.9913163,  -0.02840613, -9.937289,   -1.0486673,  -5.2340155,  -3.83912,
-      7.7165728,   -8.409944,   0.80863273,  -6.9119215,  7.5712357,   0.36031485,  -6.056131,
-      -8.470033,   1.8678337,   3.0121377,   -7.3096333,  8.205484,    5.262654,    8.774514,
-      -4.7603083,  -7.2096143,  -4.437014,   3.6080024,   -1.624254,   -4.2787876,  8.880863,
-      -4.8984556,  5.1782074,   9.944454,    3.911282,    3.5396595,   8.867042,    -1.2006199,
-      5.393288,    -5.6455317,  0.7829499,   -4.0338907,  2.479272,    6.5080743,   8.582535,
-      7.0097537,   -6.9823785,  3.984318,    -7.225381,   5.3135114,   -1.0391048,  8.951443,
-      -0.70119005, -8.510742,   -0.42949116, -10.9224825, 2.8176029,   1.6800792,   5.778404,
-      1.7269998,   7.1975236,   7.7258267,   2.7632928,   5.3399253,   3.4650044,   0.01971426,
-      -1.6468811,  4.114996,    -1.5110453,  6.8689218,   8.269899,    -3.1568048,  -7.0344677,
-      1.2911975,   5.950357,    0.19028673,  4.657226,    -8.199647,   2.246055,    8.989509,
-      5.3101015,   -4.2400866};
-
-    std::vector<float> X_embedded = {
-      -0.41849962, -0.53906363, 0.46958843,  -0.35832694, -0.23779503, -0.29751351, -0.01072748,
-      -0.21353109, -0.54769957, -0.55086273, 0.37093949,  -0.12714292, -0.06639574, -0.36098689,
-      -0.13060696, -0.07362658, -1.01205945, -0.39285606, 0.2864089,   -0.32031146, -0.19595343,
-      0.08900568,  -0.04813879, -0.06563424, -0.42655188, -0.69014251, 0.51459783,  -0.1942696,
-      -0.07767916, -0.6119386,  0.04813685,  -0.22557008, -0.56890118, -0.60293794, 0.43429622,
-      -0.09240723, -0.00624062, -0.25800395, -0.1886092,  0.01655941,  -0.01961523, -0.14147359,
-      0.41414487,  -0.8512944,  -0.61199242, -0.18586016, 0.14024924,  -0.41635606, -0.02890144,
-      0.1065347,   0.39700791,  -1.14060664, -0.95313865, 0.14416681,  0.17306046,  -0.53189689,
-      -0.98987544, -0.67918193, 0.41787854,  -0.20878236, -0.06612862, 0.03502904,  -0.03765266,
-      -0.0980606,  -0.00971657, 0.29432917,  0.36575687,  -1.1645509,  -0.89094597, 0.03718805,
-      0.2310573,   -0.38345811, -0.10401925, -0.10653082, 0.38469055,  -0.88302094, -0.80197543,
-      0.03548668,  0.02775662,  -0.54374295, 0.03379983,  0.00923623,  0.29320273,  -1.05263519,
-      -0.93360096, 0.03778313,  0.12360487,  -0.56437284, 0.0644429,   0.33432651,  0.36450726,
-      -1.22978747, -0.83822101, -0.18796451, 0.34888434,  -0.3801491,  -0.45327303, -0.59747899,
-      0.39697698,  -0.15616602, -0.06159166, -0.40301991, -0.11725303, -0.11913263, -0.12406619,
-      -0.11227967, 0.43083835,  -0.90535849, -0.81646025, 0.10012121,  -0.0141237,  -0.63747931,
-      0.04805023,  0.34190539,  0.50725192,  -1.17861414, -0.74641538, -0.09333111, 0.27992678,
-      -0.56214809, 0.04970971,  0.36249384,  0.57705611,  -1.16913795, -0.69849908, 0.10957897,
-      0.27983218,  -0.62088525, 0.0410459,   0.23973398,  0.40960434,  -1.14183664, -0.83321381,
-      0.02149482,  0.21720445,  -0.49869928, -0.95655465, -0.51680422, 0.45761383,  -0.08351214,
-      -0.12151554, 0.00819737,  -0.20813803, -0.01055793, 0.25319234,  0.36154974,  0.1822421,
-      -1.15837133, -0.92209691, -0.0501582,  0.08535917,  -0.54003763, -1.08675635, -1.04009593,
-      0.09408128,  0.07009826,  -0.01762833, -0.19180447, -0.18029785, -0.20342001, 0.04034991,
-      0.1814747,   0.36906669,  -1.13532007, -0.8852452,  0.0782818,   0.16825101,  -0.50301319,
-      -0.29128098, -0.65341312, 0.51484352,  -0.38758236, -0.22531103, -0.55021971, 0.10804344,
-      -0.3521522,  -0.38849035, -0.74110794, 0.53761131,  -0.25142813, -0.1118066,  -0.47453368,
-      0.06347904,  -0.23796193, -1.02682328, -0.47594091, 0.39515916,  -0.2782529,  -0.16566519,
-      0.08063579,  0.00810116,  -0.06213913, -1.059654,   -0.62496334, 0.53698546,  -0.11806234,
-      0.00356161,  0.11513405,  -0.14213292, 0.04102662,  -0.36622161, -0.73686272, 0.48323864,
-      -0.27338892, -0.14203401, -0.41736352, 0.03332564,  -0.21907479, -0.06396769, 0.01831361,
-      0.46263444,  -1.01878166, -0.86486858, 0.17622118,  -0.01249686, -0.74530888, -0.9354887,
-      -0.5027945,  0.38170099,  -0.15547098, 0.00677824,  -0.04677663, -0.13541745, 0.07253501,
-      -0.97933143, -0.58001202, 0.48235369,  -0.18836913, -0.02430783, 0.07572441,  -0.08101331,
-      0.00630076,  -0.16881248, -0.67989182, 0.46083611,  -0.43910736, -0.29321918, -0.38735861,
-      0.07669903,  -0.29749861, -0.40047669, -0.56722462, 0.33168188,  -0.13118173, -0.06672747,
-      -0.56856316, -0.26269144, -0.14236671, 0.10651901,  0.4962585,   0.38848072,  -1.06653547,
-      -0.64079332, -0.47378591, 0.43195483,  -0.04856951, -0.9840439,  -0.70610428, 0.34028092,
-      -0.2089237,  -0.05382041, 0.01625874,  -0.02080803, -0.12535211, -0.04146428, -1.24533033,
-      0.48944879,  0.0578458,   0.26708388,  -0.90321028, 0.35377088,  -0.36791429, -0.35382384,
-      -0.52748734, 0.42854419,  -0.31744713, -0.19174226, -0.39073724, -0.03258846, -0.19978228,
-      -0.36185205, -0.57412046, 0.43681973,  -0.25414538, -0.12904905, -0.46334973, -0.03123853,
-      -0.11303604, -0.87073672, -0.45441297, 0.41825858,  -0.25303507, -0.21845073, 0.10248682,
-      -0.11045569, -0.10002795, -0.00572806, 0.16519061,  0.42651513,  -1.11417019, -0.83789682,
-      0.02995787,  0.16843079,  -0.53874511, 0.03056994,  0.17877036,  0.49632853,  -1.03276777,
-      -0.74778616, -0.03971953, 0.10907949,  -0.67385727, -0.9523471,  -0.56550741, 0.40409449,
-      -0.2703723,  -0.10175014, 0.13605487,  -0.06306008, -0.01768126, -0.4749442,  -0.56964815,
-      0.39389887,  -0.19248079, -0.04161081, -0.38728487, -0.20341556, -0.12656988, -0.35949609,
-      -0.46137866, 0.28798422,  -0.06603147, -0.04363992, -0.60343552, -0.23565227, -0.10242701,
-      -0.06792886, 0.09689897,  0.33259571,  -0.98854214, -0.84444433, 0.00673901,  0.13457057,
-      -0.43145794, -0.51500046, -0.50821936, 0.38000089,  0.0132636,   0.0580942,   -0.40157595,
-      -0.11967677, 0.02549113,  -0.10350953, 0.22918226,  0.40411913,  -1.05619383, -0.71218503,
-      -0.02197581, 0.26422262,  -0.34765676, 0.06601537,  0.21712676,  0.34723559,  -1.20982027,
-      -0.95646334, 0.00793948,  0.27620381,  -0.43475035, -0.67326003, -0.6137197,  0.43724492,
-      -0.17666136, -0.06591748, -0.18937394, -0.07400128, -0.06881691, -0.5201112,  -0.61088628,
-      0.4225319,   -0.18969463, -0.06921366, -0.33993208, -0.06990873, -0.10288513, -0.70659858,
-      -0.56003648, 0.46628812,  -0.16090363, -0.0185108,  -0.1431348,  -0.1128775,  -0.0078648,
-      -0.02323332, 0.04292452,  0.39291084,  -0.94897962, -0.63863206, -0.16546988, 0.23698957,
-      -0.30633628};
-
-    raft::handle_t handle;
-
-    cudaStream_t stream = handle.get_stream();
-
-    rmm::device_uvector<float> d_X(X.size(), stream);
-    rmm::device_uvector<float> d_X_embedded(X_embedded.size(), stream);
-
-    raft::update_device(d_X.data(), X.data(), X.size(), stream);
-    raft::update_device(d_X_embedded.data(), X_embedded.data(), X_embedded.size(), stream);
-
-    // euclidean test
-    score = trustworthiness_score<float, raft::distance::DistanceType::L2SqrtUnexpanded>(
-      handle, d_X.data(), d_X_embedded.data(), 50, 30, 8, 5);
-  }
-
-  void SetUp() override { basicTest(); }
-
-  void TearDown() override {}
-
- protected:
-  double score;
-};
-
-typedef TrustworthinessScoreTest TrustworthinessScoreTestF;
-TEST_F(TrustworthinessScoreTestF, Result) { ASSERT_TRUE(0.9375 < score && score < 0.9379); }
-};  // namespace Score
-};  // namespace MLCommon
diff --git a/cpp/test/prims/v_measure.cu b/cpp/test/prims/v_measure.cu
deleted file mode 100644
index 750fc0ff07..0000000000
--- a/cpp/test/prims/v_measure.cu
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "test_utils.h"
-#include <algorithm>
-#include <gtest/gtest.h>
-#include <iostream>
-#include <metrics/v_measure.cuh>
-#include <raft/cudart_utils.h>
-#include <random>
-
-namespace MLCommon {
-namespace Metrics {
-
-// parameter structure definition
-struct vMeasureParam {
-  int nElements;
-  int lowerLabelRange;
-  int upperLabelRange;
-  double beta;
-  bool sameArrays;
-  double tolerance;
-};
-
-// test fixture class
-template <typename T>
-class vMeasureTest : public ::testing::TestWithParam<vMeasureParam> {
- protected:
-  // the constructor
-  void SetUp() override
-  {
-    // getting the parameters
-    params = ::testing::TestWithParam<vMeasureParam>::GetParam();
-
-    nElements       = params.nElements;
-    lowerLabelRange = params.lowerLabelRange;
-    upperLabelRange = params.upperLabelRange;
-
-    // generating random value test input
-    std::vector<int> arr1(nElements, 0);
-    std::vector<int> arr2(nElements, 0);
-    std::random_device rd;
-    std::default_random_engine dre(rd());
-    std::uniform_int_distribution<int> intGenerator(lowerLabelRange, upperLabelRange);
-
-    std::generate(arr1.begin(), arr1.end(), [&]() { return intGenerator(dre); });
-    if (params.sameArrays) {
-      arr2 = arr1;
-    } else {
-      std::generate(arr2.begin(), arr2.end(), [&]() { return intGenerator(dre); });
-    }
-
-    // allocating and initializing memory to the GPU
-
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-    rmm::device_uvector<T> truthClusterArray(nElements, stream);
-    rmm::device_uvector<T> predClusterArray(nElements, stream);
-    raft::update_device(truthClusterArray.data(), &arr1[0], (int)nElements, stream);
-    raft::update_device(predClusterArray.data(), &arr2[0], (int)nElements, stream);
-
-    // calculating the golden output
-    double truthHomogeity, truthCompleteness;
-
-    truthHomogeity    = MLCommon::Metrics::homogeneity_score(truthClusterArray.data(),
-                                                          predClusterArray.data(),
-                                                          nElements,
-                                                          lowerLabelRange,
-                                                          upperLabelRange,
-                                                          stream);
-    truthCompleteness = MLCommon::Metrics::homogeneity_score(predClusterArray.data(),
-                                                             truthClusterArray.data(),
-                                                             nElements,
-                                                             lowerLabelRange,
-                                                             upperLabelRange,
-                                                             stream);
-
-    if (truthCompleteness + truthHomogeity == 0.0)
-      truthVMeasure = 0.0;
-    else
-      truthVMeasure = ((1 + params.beta) * truthHomogeity * truthCompleteness /
-                       (params.beta * truthHomogeity + truthCompleteness));
-    // calling the v_measure CUDA implementation
-    computedVMeasure = MLCommon::Metrics::v_measure(truthClusterArray.data(),
-                                                    predClusterArray.data(),
-                                                    nElements,
-                                                    lowerLabelRange,
-                                                    upperLabelRange,
-                                                    stream,
-                                                    params.beta);
-  }
-
-  // the destructor
-  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
-
-  // declaring the data values
-  vMeasureParam params;
-  T lowerLabelRange, upperLabelRange;
-  int nElements           = 0;
-  double truthVMeasure    = 0;
-  double computedVMeasure = 0;
-  cudaStream_t stream     = 0;
-};
-
-// setting test parameter values
-const std::vector<vMeasureParam> inputs = {{199, 1, 10, 1.0, false, 0.000001},
-                                           {200, 15, 100, 1.0, false, 0.000001},
-                                           {100, 1, 20, 1.0, false, 0.000001},
-                                           {10, 1, 10, 1.0, false, 0.000001},
-                                           {198, 1, 100, 1.0, false, 0.000001},
-                                           {300, 3, 99, 1.0, false, 0.000001},
-                                           {199, 1, 10, 1.0, true, 0.000001},
-                                           {200, 15, 100, 1.0, true, 0.000001},
-                                           {100, 1, 20, 1.0, true, 0.000001},
-                                           {10, 1, 10, 1.0, true, 0.000001},
-                                           {198, 1, 100, 1.0, true, 0.000001},
-                                           {300, 3, 99, 1.0, true, 0.000001}};
-
-// writing the test suite
-typedef vMeasureTest<int> vMeasureTestClass;
-TEST_P(vMeasureTestClass, Result)
-{
-  ASSERT_NEAR(computedVMeasure, truthVMeasure, params.tolerance);
-}
-INSTANTIATE_TEST_CASE_P(vMeasure, vMeasureTestClass, ::testing::ValuesIn(inputs));
-
-}  // end namespace Metrics
-}  // end namespace MLCommon

From 142b3280a1f4d551a0cac35bda2bae631321f00c Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 16 Feb 2022 15:12:22 -0500
Subject: [PATCH 02/38] Updating

---
 cpp/cmake/thirdparty/get_raft.cmake           |   4 +-
 cpp/src/arima/batched_arima.cu                |  17 +-
 cpp/src/metrics/accuracy_score.cu             |   4 +-
 cpp/src/metrics/completeness_score.cu         |   2 +-
 cpp/src/metrics/silhouette_score.cu           |   2 +-
 cpp/src/randomforest/randomforest.cuh         |  19 +-
 cpp/test/CMakeLists.txt                       |   4 +-
 .../prims/batched/information_criterion.cu    | 149 ----------------
 cpp/test/prims/columnSort.cu                  | 165 ------------------
 cpp/test/sg/hdbscan_test.cu                   |   6 +-
 10 files changed, 28 insertions(+), 344 deletions(-)
 delete mode 100644 cpp/test/prims/batched/information_criterion.cu
 delete mode 100644 cpp/test/prims/columnSort.cu

diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index 63f795d519..2dca5bb329 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -57,8 +57,8 @@ set(CUML_BRANCH_VERSION_raft "${CUML_VERSION_MAJOR}.${CUML_VERSION_MINOR}")
 # To use a different RAFT locally, set the CMake variable
 # CPM_raft_SOURCE=/path/to/local/raft
 find_and_configure_raft(VERSION          ${CUML_MIN_VERSION_raft}
-                        FORK             rapidsai
-                        PINNED_TAG       branch-${CUML_BRANCH_VERSION_raft}
+                        FORK             cjnolet
+                        PINNED_TAG       imp-2204-metrics_from_cuml
                         USE_RAFT_NN      ${CUML_USE_RAFT_NN}
                         USE_FAISS_STATIC ${CUML_USE_FAISS_STATIC}
                         )
diff --git a/cpp/src/arima/batched_arima.cu b/cpp/src/arima/batched_arima.cu
index 9bf5cf3225..8c80838d1f 100644
--- a/cpp/src/arima/batched_arima.cu
+++ b/cpp/src/arima/batched_arima.cu
@@ -30,12 +30,12 @@
 
 #include <common/nvtx.hpp>
 #include <linalg/batched/matrix.cuh>
-#include <metrics/batched/information_criterion.cuh>
 #include <raft/common/nvtx.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/handle.hpp>
 #include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/stats/information_criterion.hpp>
 #include <rmm/device_uvector.hpp>
 #include <timeSeries/arima_helpers.cuh>
 #include <timeSeries/fillna.cuh>
@@ -612,14 +612,13 @@ void information_criterion(raft::handle_t& handle,
     handle, arima_mem, d_y, d_exog, batch_size, n_obs, order, params, d_ic, false, false, MLE);
 
   /* Compute information criterion from log-likelihood and base term */
-  MLCommon::Metrics::Batched::information_criterion(
-    d_ic,
-    d_ic,
-    static_cast<MLCommon::Metrics::IC_Type>(ic_type),
-    order.complexity(),
-    batch_size,
-    n_obs - order.n_diff(),
-    stream);
+  raft::stats::information_criterion_batched(d_ic,
+                                             d_ic,
+                                             static_cast<raft::stats::IC_Type>(ic_type),
+                                             order.complexity(),
+                                             batch_size,
+                                             n_obs - order.n_diff(),
+                                             stream);
 }
 
 /**
diff --git a/cpp/src/metrics/accuracy_score.cu b/cpp/src/metrics/accuracy_score.cu
index 048d4f9047..79b2e96bf5 100644
--- a/cpp/src/metrics/accuracy_score.cu
+++ b/cpp/src/metrics/accuracy_score.cu
@@ -16,7 +16,7 @@
  */
 
 #include <cuml/metrics/metrics.hpp>
-#include <raft/stats/accuracy_score.hpp>
+#include <raft/stats/accuracy.hpp>
 
 namespace ML {
 
@@ -27,7 +27,7 @@ float accuracy_score_py(const raft::handle_t& handle,
                         const int* ref_predictions,
                         int n)
 {
-  return raft::stats::accuracy_score(predictions, ref_predictions, n, handle.get_stream());
+  return raft::stats::accuracy(predictions, ref_predictions, n, handle.get_stream());
 }
 }  // namespace Metrics
 }  // namespace ML
diff --git a/cpp/src/metrics/completeness_score.cu b/cpp/src/metrics/completeness_score.cu
index 786be71387..f71ec7235f 100644
--- a/cpp/src/metrics/completeness_score.cu
+++ b/cpp/src/metrics/completeness_score.cu
@@ -29,7 +29,7 @@ double completeness_score(const raft::handle_t& handle,
                           const int lower_class_range,
                           const int upper_class_range)
 {
-  return raft::stats::homogeneity_score(
+  return raft::stats::completeness_score(
     y_hat, y, n, lower_class_range, upper_class_range, handle.get_stream());
 }
 
diff --git a/cpp/src/metrics/silhouette_score.cu b/cpp/src/metrics/silhouette_score.cu
index d9c4812274..c3428be81b 100644
--- a/cpp/src/metrics/silhouette_score.cu
+++ b/cpp/src/metrics/silhouette_score.cu
@@ -47,7 +47,7 @@ float silhouette_score(const raft::handle_t& handle,
                        int chunk,
                        raft::distance::DistanceType metric)
 {
-  return raft::stats::Batched::silhouette_score_batched<float, int, int>(
+  return raft::stats::silhouette_score_batched<float, int, int>(
     handle, X, n_rows, n_cols, y, n_labels, scores, chunk, metric);
 }
 
diff --git a/cpp/src/randomforest/randomforest.cuh b/cpp/src/randomforest/randomforest.cuh
index 97838bd53c..94141830fd 100644
--- a/cpp/src/randomforest/randomforest.cuh
+++ b/cpp/src/randomforest/randomforest.cuh
@@ -22,9 +22,10 @@
 #include <decisiontree/decisiontree.cuh>
 #include <decisiontree/treelite_util.h>
 
-#include <metrics/scores.cuh>
 #include <raft/random/permute.hpp>
 #include <raft/random/rng.hpp>
+#include <raft/stats/accuracy.hpp>
+#include <raft/stats/regression_metrics.hpp>
 
 #include <raft/cudart_utils.h>
 #include <raft/mr/device/allocator.hpp>
@@ -279,7 +280,7 @@ class RandomForest {
     cudaStream_t stream = user_handle.get_stream();
     RF_metrics stats;
     if (rf_type == RF_type::CLASSIFICATION) {  // task classifiation: get classification metrics
-      float accuracy = MLCommon::Score::accuracy_score(predictions, ref_labels, n_rows, stream);
+      float accuracy = raft::stats::accuracy(predictions, ref_labels, n_rows, stream);
       stats          = set_rf_metrics_classification(accuracy);
       if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) print(stats);
 
@@ -288,13 +289,13 @@ class RandomForest {
         for each of these metrics */
     } else {  // regression task: get regression metrics
       double mean_abs_error, mean_squared_error, median_abs_error;
-      MLCommon::Score::regression_metrics(predictions,
-                                          ref_labels,
-                                          n_rows,
-                                          stream,
-                                          mean_abs_error,
-                                          mean_squared_error,
-                                          median_abs_error);
+      raft::stats::regression_metrics(predictions,
+                                      ref_labels,
+                                      n_rows,
+                                      stream,
+                                      mean_abs_error,
+                                      mean_squared_error,
+                                      median_abs_error);
       stats = set_rf_metrics_regression(mean_abs_error, mean_squared_error, median_abs_error);
       if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) print(stats);
     }
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 05f31c79dc..0a1dfb3eb6 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -170,11 +170,9 @@ if(BUILD_PRIMS_TESTS)
     prims/add_sub_dev_scalar.cu
           prims/batched/csr.cu
     prims/batched/gemv.cu
-    prims/batched/information_criterion.cu
-    prims/batched/make_symm.cu
+          prims/batched/make_symm.cu
     prims/batched/matrix.cu
     prims/cache.cu
-    prims/columnSort.cu
           prims/decoupled_lookback.cu
     prims/device_utils.cu
           prims/eltwise2d.cu
diff --git a/cpp/test/prims/batched/information_criterion.cu b/cpp/test/prims/batched/information_criterion.cu
deleted file mode 100644
index 9fea222059..0000000000
--- a/cpp/test/prims/batched/information_criterion.cu
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <test_utils.h>
-
-#include <metrics/batched/information_criterion.cuh>
-
-#include <raft/cudart_utils.h>
-#include <raft/mr/device/allocator.hpp>
-
-#include <gtest/gtest.h>
-
-#include <cmath>
-#include <random>
-#include <vector>
-
-namespace MLCommon {
-namespace Metrics {
-namespace Batched {
-
-template <typename T>
-void naive_ic(
-  T* h_ic, const T* h_loglike, IC_Type ic_type, int n_params, int batch_size, int n_samples)
-{
-  T ic_base{};
-  T N = static_cast<T>(n_params);
-  T M = static_cast<T>(n_samples);
-  switch (ic_type) {
-    case AIC: ic_base = (T)2 * N; break;
-    case AICc: ic_base = (T)2 * (N + (N * (N + (T)1)) / (M - N - (T)1)); break;
-    case BIC: ic_base = std::log(M) * N; break;
-  }
-#pragma omp parallel for
-  for (int bid = 0; bid < batch_size; bid++) {
-    h_ic[bid] = ic_base - (T)2.0 * h_loglike[bid];
-  }
-}
-
-template <typename T>
-struct BatchedICInputs {
-  int batch_size;
-  int n_params;
-  int n_samples;
-  IC_Type ic_type;
-  T tolerance;
-};
-
-template <typename T>
-class BatchedICTest : public ::testing::TestWithParam<BatchedICInputs<T>> {
- protected:
-  void SetUp() override
-  {
-    using std::vector;
-    params = ::testing::TestWithParam<BatchedICInputs<T>>::GetParam();
-
-    // Create stream and allocator
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-    allocator = std::make_shared<raft::mr::device::default_allocator>();
-
-    // Create arrays
-    std::vector<T> loglike_h = std::vector<T>(params.batch_size);
-    res_h.resize(params.batch_size);
-    T* loglike_d = (T*)allocator->allocate(sizeof(T) * params.batch_size, stream);
-    res_d        = (T*)allocator->allocate(sizeof(T) * params.batch_size, stream);
-
-    // Generate random data
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_real_distribution<T> udis(0.001, 1.0);  // 0 has no log
-    for (int i = 0; i < params.batch_size; i++)
-      loglike_h[i] = std::log(udis(gen));
-
-    // Copy the data to the device
-    raft::update_device(loglike_d, loglike_h.data(), params.batch_size, stream);
-
-    // Compute the tested results
-    information_criterion(res_d,
-                          loglike_d,
-                          params.ic_type,
-                          params.n_params,
-                          params.batch_size,
-                          params.n_samples,
-                          stream);
-
-    // Compute the expected results
-    naive_ic(res_h.data(),
-             loglike_h.data(),
-             params.ic_type,
-             params.n_params,
-             params.batch_size,
-             params.n_samples);
-
-    allocator->deallocate(loglike_d, sizeof(T) * params.batch_size, stream);
-  }
-
-  void TearDown() override
-  {
-    allocator->deallocate(res_d, sizeof(T) * params.batch_size, stream);
-    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-  }
-
- protected:
-  std::shared_ptr<raft::mr::device::default_allocator> allocator;
-  BatchedICInputs<T> params;
-  T* res_d;
-  std::vector<T> res_h;
-  cudaStream_t stream = 0;
-};
-
-// Test parameters (op, n_batches, m, n, p, q, tolerance)
-const std::vector<BatchedICInputs<double>> inputsd = {
-  {1, 5, 52, AIC, 1e-3}, {10, 7, 100, AICc, 1e-3}, {67, 2, 350, BIC, 1e-3}};
-
-// Test parameters (op, n_batches, m, n, p, q, tolerance)
-const std::vector<BatchedICInputs<float>> inputsf = {
-  {1, 5, 52, AIC, 1e-3}, {10, 7, 100, AICc, 1e-3}, {67, 2, 350, BIC, 1e-3}};
-
-using BatchedICTestD = BatchedICTest<double>;
-using BatchedICTestF = BatchedICTest<float>;
-TEST_P(BatchedICTestD, Result)
-{
-  ASSERT_TRUE(devArrMatchHost(
-    res_h.data(), res_d, params.batch_size, raft::CompareApprox<double>(params.tolerance), stream));
-}
-TEST_P(BatchedICTestF, Result)
-{
-  ASSERT_TRUE(devArrMatchHost(
-    res_h.data(), res_d, params.batch_size, raft::CompareApprox<float>(params.tolerance), stream));
-}
-
-INSTANTIATE_TEST_CASE_P(BatchedICTests, BatchedICTestD, ::testing::ValuesIn(inputsd));
-INSTANTIATE_TEST_CASE_P(BatchedICTests, BatchedICTestF, ::testing::ValuesIn(inputsf));
-
-}  // namespace Batched
-}  // namespace Metrics
-}  // namespace MLCommon
diff --git a/cpp/test/prims/columnSort.cu b/cpp/test/prims/columnSort.cu
deleted file mode 100644
index 8c7ba8aa97..0000000000
--- a/cpp/test/prims/columnSort.cu
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "test_utils.h"
-#include <algorithm>
-#include <gtest/gtest.h>
-#include <numeric>
-#include <raft/cudart_utils.h>
-#include <rmm/device_uvector.hpp>
-#include <selection/columnWiseSort.cuh>
-
-namespace MLCommon {
-namespace Selection {
-
-template <typename T>
-std::vector<int>* sort_indexes(const std::vector<T>& v)
-{
-  // initialize original index locations
-  std::vector<int>* idx = new std::vector<int>(v.size());
-  std::iota((*idx).begin(), (*idx).end(), 0);
-
-  // sort indexes based on comparing values in v
-  std::sort((*idx).begin(), (*idx).end(), [&v](int i1, int i2) { return v[i1] < v[i2]; });
-  return idx;
-}
-
-template <typename T>
-struct columnSort {
-  T tolerance;
-  int n_row;
-  int n_col;
-  bool testKeys;
-};
-
-template <typename T>
-::std::ostream& operator<<(::std::ostream& os, const columnSort<T>& dims)
-{
-  return os;
-}
-
-template <typename T>
-class ColumnSort : public ::testing::TestWithParam<columnSort<T>> {
- protected:
-  ColumnSort()
-    : keyIn(0, stream),
-      keySorted(0, stream),
-      keySortGolden(0, stream),
-      valueOut(0, stream),
-      goldenValOut(0, stream),
-      workspacePtr(0, stream)
-  {
-  }
-
-  void SetUp() override
-  {
-    params  = ::testing::TestWithParam<columnSort<T>>::GetParam();
-    int len = params.n_row * params.n_col;
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-    keyIn.resize(len, stream);
-    valueOut.resize(len, stream);
-    goldenValOut.resize(len, stream);
-    if (params.testKeys) {
-      keySorted.resize(len, stream);
-      keySortGolden.resize(len, stream);
-    }
-
-    std::vector<T> vals(len);
-    std::vector<int> cValGolden(len);
-    std::iota(vals.begin(), vals.end(),
-              1.0f);  // will have to change input param type
-    std::random_shuffle(vals.begin(), vals.end());
-
-    std::vector<T> cKeyGolden(len);
-
-    for (int i = 0; i < params.n_row; i++) {
-      std::vector<T> tmp(vals.begin() + i * params.n_col, vals.begin() + (i + 1) * params.n_col);
-      auto cpuOut = sort_indexes(tmp);
-      std::copy((*cpuOut).begin(), (*cpuOut).end(), cValGolden.begin() + i * params.n_col);
-      delete cpuOut;
-
-      if (params.testKeys) {
-        std::sort(tmp.begin(), tmp.end());
-        std::copy(tmp.begin(), tmp.end(), cKeyGolden.begin() + i * params.n_col);
-      }
-    }
-
-    raft::update_device(keyIn.data(), &vals[0], len, stream);
-    raft::update_device(goldenValOut.data(), &cValGolden[0], len, stream);
-
-    if (params.testKeys) raft::update_device(keySortGolden.data(), &cKeyGolden[0], len, stream);
-
-    bool needWorkspace   = false;
-    size_t workspaceSize = 0;
-    // Remove this branch once the implementation of descending sort is fixed.
-    sortColumnsPerRow(keyIn.data(),
-                      valueOut.data(),
-                      params.n_row,
-                      params.n_col,
-                      needWorkspace,
-                      NULL,
-                      workspaceSize,
-                      stream,
-                      keySorted.data());
-    if (needWorkspace) {
-      workspacePtr.resize(workspaceSize, stream);
-      sortColumnsPerRow(keyIn.data(),
-                        valueOut.data(),
-                        params.n_row,
-                        params.n_col,
-                        needWorkspace,
-                        workspacePtr.data(),
-                        workspaceSize,
-                        stream,
-                        keySorted.data());
-    }
-    RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-  }
-
- protected:
-  cudaStream_t stream = 0;
-  columnSort<T> params;
-  rmm::device_uvector<T> keyIn, keySorted, keySortGolden;
-  rmm::device_uvector<int> valueOut, goldenValOut;  // valueOut are indexes
-  rmm::device_uvector<char> workspacePtr;
-};
-
-const std::vector<columnSort<float>> inputsf1 = {{0.000001f, 503, 2000, false},
-                                                 {0.000001f, 113, 20000, true},
-                                                 {0.000001f, 503, 2000, false},
-                                                 {0.000001f, 113, 20000, true}};
-
-typedef ColumnSort<float> ColumnSortF;
-TEST_P(ColumnSortF, Result)
-{
-  // Remove this condition once the implementation of of descending sort is
-  // fixed.
-  ASSERT_TRUE(devArrMatch(valueOut.data(),
-                          goldenValOut.data(),
-                          params.n_row * params.n_col,
-                          raft::CompareApprox<float>(params.tolerance)));
-  if (params.testKeys) {
-    ASSERT_TRUE(devArrMatch(keySorted.data(),
-                            keySortGolden.data(),
-                            params.n_row * params.n_col,
-                            raft::CompareApprox<float>(params.tolerance)));
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(ColumnSortTests, ColumnSortF, ::testing::ValuesIn(inputsf1));
-
-}  // end namespace Selection
-}  // end namespace MLCommon
diff --git a/cpp/test/sg/hdbscan_test.cu b/cpp/test/sg/hdbscan_test.cu
index b8df86f0f9..832bb42a37 100644
--- a/cpp/test/sg/hdbscan_test.cu
+++ b/cpp/test/sg/hdbscan_test.cu
@@ -26,7 +26,7 @@
 #include <hdbscan/detail/extract.cuh>
 #include <hdbscan/detail/utils.h>
 
-#include <metrics/adjusted_rand_index.cuh>
+#include <raft/stats/adjusted_rand_index.hpp>
 
 #include <raft/sparse/hierarchy/detail/agglomerative.cuh>
 
@@ -107,7 +107,7 @@ class HDBSCANTest : public ::testing::TestWithParam<HDBSCANInputs<T, IdxT>> {
 
     handle.sync_stream(handle.get_stream());
 
-    score = MLCommon::Metrics::compute_adjusted_rand_index(
+    score = raft::stats::adjusted_rand_index(
       out.get_labels(), labels_ref.data(), params.n_row, handle.get_stream());
   }
 
@@ -304,7 +304,7 @@ class ClusterSelectionTest : public ::testing::TestWithParam<ClusterSelectionInp
 
     rmm::device_uvector<IdxT> labels_ref(params.n_row, handle.get_stream());
     raft::update_device(labels_ref.data(), params.labels.data(), params.n_row, handle.get_stream());
-    score = MLCommon::Metrics::compute_adjusted_rand_index(
+    score = raft::stats::adjusted_rand_index(
       labels.data(), labels_ref.data(), params.n_row, handle.get_stream());
     handle.sync_stream(handle.get_stream());
   }

From 9722d08c10f9a638e1f0aec966a430ff26ec3058 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 16 Feb 2022 16:02:53 -0500
Subject: [PATCH 03/38] Adding knn back

---
 cpp/src_prims/selection/knn.cuh | 345 ++++++++++++++++++++++++++++++++
 1 file changed, 345 insertions(+)
 create mode 100644 cpp/src_prims/selection/knn.cuh

diff --git a/cpp/src_prims/selection/knn.cuh b/cpp/src_prims/selection/knn.cuh
new file mode 100644
index 0000000000..04070c75e1
--- /dev/null
+++ b/cpp/src_prims/selection/knn.cuh
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <label/classlabels.cuh>
+
+#include <cuml/neighbors/knn.hpp>
+
+#include <raft/cuda_utils.cuh>
+#include <raft/cudart_utils.h>
+#include <raft/distance/distance.hpp>
+#include <raft/distance/distance_type.hpp>
+#include <raft/mr/device/allocator.hpp>
+
+#include <faiss/gpu/GpuDistance.h>
+#include <faiss/gpu/GpuIndexFlat.h>
+#include <faiss/gpu/GpuIndexIVFFlat.h>
+#include <faiss/gpu/GpuIndexIVFPQ.h>
+#include <faiss/gpu/GpuIndexIVFScalarQuantizer.h>
+#include <faiss/gpu/GpuResources.h>
+#include <faiss/gpu/StandardGpuResources.h>
+#include <faiss/gpu/utils/Limits.cuh>
+#include <faiss/gpu/utils/Select.cuh>
+#include <faiss/gpu/utils/Tensor.cuh>
+#include <faiss/utils/Heap.h>
+
+#include <thrust/device_vector.h>
+#include <thrust/iterator/transform_iterator.h>
+
+#include <cstddef>
+#include <iostream>
+#include <set>
+
+namespace MLCommon {
+namespace Selection {
+
+template <bool precomp_lbls, typename T>
+inline __device__ T get_lbls(const T* labels, const int64_t* knn_indices, int64_t idx)
+{
+  if (precomp_lbls) {
+    return labels[idx];
+  } else {
+    int64_t neighbor_idx = knn_indices[idx];
+    return labels[neighbor_idx];
+  }
+}
+
+template <typename OutType = float, bool precomp_lbls = false>
+__global__ void class_probs_kernel(OutType* out,
+                                   const int64_t* knn_indices,
+                                   const int* labels,
+                                   int n_uniq_labels,
+                                   std::size_t n_samples,
+                                   int n_neighbors)
+{
+  int row = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int i   = row * n_neighbors;
+
+  float n_neigh_inv = 1.0f / n_neighbors;
+
+  if (row >= n_samples) return;
+
+  for (int j = 0; j < n_neighbors; j++) {
+    int out_label = get_lbls<precomp_lbls>(labels, knn_indices, i + j);
+    int out_idx   = row * n_uniq_labels + out_label;
+    out[out_idx] += n_neigh_inv;
+  }
+}
+
+template <typename OutType = int>
+__global__ void class_vote_kernel(OutType* out,
+                                  const float* class_proba,
+                                  int* unique_labels,
+                                  int n_uniq_labels,
+                                  std::size_t n_samples,
+                                  int n_outputs,
+                                  int output_offset,
+                                  bool use_shared_mem)
+{
+  int row = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int i   = row * n_uniq_labels;
+
+  extern __shared__ int label_cache[];
+  if (use_shared_mem) {
+    for (int j = threadIdx.x; j < n_uniq_labels; j += blockDim.x) {
+      label_cache[j] = unique_labels[j];
+    }
+
+    __syncthreads();
+  }
+
+  if (row >= n_samples) return;
+  float cur_max = -1.0;
+  int cur_label = -1;
+  for (int j = 0; j < n_uniq_labels; j++) {
+    float cur_proba = class_proba[i + j];
+    if (cur_proba > cur_max) {
+      cur_max   = cur_proba;
+      cur_label = j;
+    }
+  }
+
+  int val = use_shared_mem ? label_cache[cur_label] : unique_labels[cur_label];
+
+  out[row * n_outputs + output_offset] = val;
+}
+
+template <typename LabelType, bool precomp_lbls = false>
+__global__ void regress_avg_kernel(LabelType* out,
+                                   const int64_t* knn_indices,
+                                   const LabelType* labels,
+                                   std::size_t n_samples,
+                                   int n_neighbors,
+                                   int n_outputs,
+                                   int output_offset)
+{
+  int row = (blockIdx.x * blockDim.x) + threadIdx.x;
+  int i   = row * n_neighbors;
+
+  if (row >= n_samples) return;
+
+  LabelType pred = 0;
+  for (int j = 0; j < n_neighbors; j++) {
+    pred += get_lbls<precomp_lbls>(labels, knn_indices, i + j);
+  }
+
+  out[row * n_outputs + output_offset] = pred / (LabelType)n_neighbors;
+}
+
+/**
+ * A naive knn classifier to predict probabilities
+ * @tparam TPB_X number of threads per block to use. each thread
+ *               will process a single row of knn_indices
+ * @tparam precomp_lbls is set to true for the reduction step of MNMG KNN Classifier. In this case,
+ *         the knn_indices array is not used as the y arrays already store the labels for each row.
+ *         This makes it possible to compute the reduction step without holding all the data on a
+ * single machine.
+ * @param[out] out vector of output class probabilities of the same size as y.
+ *            each element should be of size (n_samples * n_classes[i])
+ * @param[in] knn_indices the index array resulting from a knn search
+ * @param[in] y vector of label arrays. for multilabel classification,
+ *          each output in the vector is a different array of labels
+ *          corresponding to the i'th output.
+ * @param[in] n_index_rows number of vertices in index (eg. size of each y array)
+ * @param[in] n_query_rows number of rows in knn_indices
+ * @param[in] k number of neighbors in knn_indices
+ * @param[in] uniq_labels vector of the sorted unique labels for each array in y
+ * @param[in] n_unique vector of sizes for each array in uniq_labels
+ * @param[in] handle raft handle; the work for each array in y is queued on a stream
+ *        obtained from handle.get_next_usable_stream(). There are no separate
+ *        user_stream / int_streams / n_int_streams arguments: every stream used
+ *        here comes from the handle.
+ */
+template <int TPB_X = 32, bool precomp_lbls = false>
+void class_probs(const raft::handle_t& handle,
+                 std::vector<float*>& out,
+                 const int64_t* knn_indices,
+                 std::vector<int*>& y,
+                 std::size_t n_index_rows,
+                 std::size_t n_query_rows,
+                 int k,
+                 std::vector<int*>& uniq_labels,
+                 std::vector<int>& n_unique)
+{
+  for (std::size_t i = 0; i < y.size(); i++) {
+    cudaStream_t stream = handle.get_next_usable_stream();
+
+    int n_unique_labels = n_unique[i];
+    size_t cur_size     = n_query_rows * n_unique_labels;
+
+    RAFT_CUDA_TRY(cudaMemsetAsync(out[i], 0, cur_size * sizeof(float), stream));
+
+    dim3 grid(raft::ceildiv(n_query_rows, static_cast<std::size_t>(TPB_X)), 1, 1);
+    dim3 blk(TPB_X, 1, 1);
+
+    /**
+     * Build array of class probability arrays from
+     * knn_indices and labels
+     */
+    rmm::device_uvector<int> y_normalized(n_index_rows + n_unique_labels, stream);
+
+    /*
+     * Appending the array of unique labels to the original labels array
+     * to prevent make_monotonic function from producing misleading results
+     * due to the absence of some of the unique labels in the labels array
+     */
+    rmm::device_uvector<int> y_tmp(n_index_rows + n_unique_labels, stream);
+    raft::update_device(y_tmp.data(), y[i], n_index_rows, stream);
+    raft::update_device(y_tmp.data() + n_index_rows, uniq_labels[i], n_unique_labels, stream);
+
+    MLCommon::Label::make_monotonic(y_normalized.data(), y_tmp.data(), y_tmp.size(), stream);
+    raft::linalg::unaryOp<int>(
+      y_normalized.data(),
+      y_normalized.data(),
+      n_index_rows,
+      [] __device__(int input) { return input - 1; },
+      stream);
+    class_probs_kernel<float, precomp_lbls><<<grid, blk, 0, stream>>>(
+      out[i], knn_indices, y_normalized.data(), n_unique_labels, n_query_rows, k);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+}
+
+/**
+ * KNN classifier using voting based on the statistical mode of classes.
+ * In the event of a tie, the class with the lowest index in the sorted
+ * array of unique monotonically increasing labels will be used.
+ *
+ * @tparam TPB_X the number of threads per block to use
+ * @tparam precomp_lbls is set to true for the reduction step of MNMG KNN Classifier. In this case,
+ * the knn_indices array is not used as the y arrays already store the labels for each row.
+ * This makes it possible to compute the reduction step without holding all the data on a single
+ * machine.
+ * @param[out] out output array of size (n_samples * y.size())
+ * @param[in] knn_indices index array from knn search
+ * @param[in] y vector of label arrays. for multilabel classification, each
+ *          element in the vector is a different "output" array of labels corresponding
+ *          to the i'th output.
+ * @param[in] n_index_rows number of vertices in index (eg. size of each y array)
+ * @param[in] n_query_rows number of rows in knn_indices
+ * @param[in] k number of neighbors in knn_indices
+ * @param[in] uniq_labels vector of the sorted unique labels for each array in y
+ * @param[in] n_unique vector of sizes for each array in uniq_labels
+ * @param[in] handle raft handle; the temporary buffer and the voting kernel for
+ *        output i use the stream obtained from handle.get_next_usable_stream(i).
+ *        There are no separate user_stream / int_streams / n_int_streams
+ *        arguments: every stream used here comes from the handle.
+ */
+template <int TPB_X = 32, bool precomp_lbls = false>
+void knn_classify(const raft::handle_t& handle,
+                  int* out,
+                  const int64_t* knn_indices,
+                  std::vector<int*>& y,
+                  std::size_t n_index_rows,
+                  std::size_t n_query_rows,
+                  int k,
+                  std::vector<int*>& uniq_labels,
+                  std::vector<int>& n_unique)
+{
+  std::vector<float*> probs;
+  std::vector<rmm::device_uvector<float>> tmp_probs;
+
+  // allocate temporary memory
+  for (std::size_t i = 0; i < n_unique.size(); i++) {
+    int size = n_unique[i];
+
+    cudaStream_t stream = handle.get_next_usable_stream(i);
+
+    tmp_probs.emplace_back(n_query_rows * size, stream);
+    probs.push_back(tmp_probs.back().data());
+  }
+
+  /**
+   * Compute class probabilities
+   *
+   * Note: Since class_probs will use the same round robin strategy for distributing
+   * work to the streams, we don't need to explicitly synchronize the streams here.
+   */
+  class_probs<32, precomp_lbls>(
+    handle, probs, knn_indices, y, n_index_rows, n_query_rows, k, uniq_labels, n_unique);
+
+  dim3 grid(raft::ceildiv(n_query_rows, static_cast<std::size_t>(TPB_X)), 1, 1);
+  dim3 blk(TPB_X, 1, 1);
+
+  for (std::size_t i = 0; i < y.size(); i++) {
+    cudaStream_t stream = handle.get_next_usable_stream(i);
+
+    int n_unique_labels = n_unique[i];
+
+    /**
+     * Choose max probability
+     */
+    // Use shared memory for label lookups if the number of classes is small enough
+    int smem            = sizeof(int) * n_unique_labels;
+    bool use_shared_mem = smem < raft::getSharedMemPerBlock();
+
+    class_vote_kernel<<<grid, blk, use_shared_mem ? smem : 0, stream>>>(
+      out, probs[i], uniq_labels[i], n_unique_labels, n_query_rows, y.size(), i, use_shared_mem);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+}
+
+/**
+ * KNN regression using voting based on the mean of the labels for the
+ * nearest neighbors.
+ * @tparam ValType data type of the labels
+ * @tparam TPB_X the number of threads per block to use
+ * @tparam precomp_lbls is set to true for the reduction step of MNMG KNN Regressor. In this case,
+ * the knn_indices array is not used as the y arrays already store the output for each row.
+ * This makes it possible to compute the reduction step without holding all the data on a single
+ * machine.
+ * @param[out] out output array of size (n_samples * y.size())
+ * @param[in] knn_indices index array from knn search
+ * @param[in] y vector of label arrays. for multilabel classification, each
+ *          element in the vector is a different "output" array of labels corresponding
+ *          to the i'th output.
+ * @param[in] n_index_rows number of vertices in index (eg. size of each y array)
+ * @param[in] n_query_rows number of rows in knn_indices
+ * @param[in] k number of neighbors in knn_indices
+ * @param[in] handle raft handle; the kernel for each array in y is launched on a
+ *        stream obtained from handle.get_next_usable_stream(), and that stream is
+ *        synchronized right after the launch. There are no separate user_stream /
+ *        int_streams / n_int_streams arguments.
+ */
+
+template <typename ValType, int TPB_X = 32, bool precomp_lbls = false>
+void knn_regress(const raft::handle_t& handle,
+                 ValType* out,
+                 const int64_t* knn_indices,
+                 const std::vector<ValType*>& y,
+                 size_t n_index_rows,
+                 size_t n_query_rows,
+                 int k)
+{
+  /**
+   * Vote average regression value
+   */
+  for (std::size_t i = 0; i < y.size(); i++) {
+    cudaStream_t stream = handle.get_next_usable_stream();
+
+    regress_avg_kernel<ValType, precomp_lbls>
+      <<<raft::ceildiv(n_query_rows, static_cast<std::size_t>(TPB_X)), TPB_X, 0, stream>>>(
+        out, knn_indices, y[i], n_query_rows, k, y.size(), i);
+
+    handle.sync_stream(stream);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
+}
+
+};  // namespace Selection
+};  // namespace MLCommon
\ No newline at end of file

From 8b1de6be448afdb6bd64894204fa52df5dfe96c1 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 16 Feb 2022 16:51:45 -0500
Subject: [PATCH 04/38] Updating copyrights

---
 cpp/src/metrics/accuracy_score.cu      | 2 +-
 cpp/src/metrics/adjusted_rand_index.cu | 2 +-
 cpp/src/metrics/completeness_score.cu  | 2 +-
 cpp/src/metrics/entropy.cu             | 2 +-
 cpp/src/metrics/homogeneity_score.cu   | 2 +-
 cpp/src/metrics/kl_divergence.cu       | 2 +-
 cpp/src/metrics/mutual_info_score.cu   | 2 +-
 cpp/src/metrics/r2_score.cu            | 2 +-
 cpp/src/metrics/rand_index.cu          | 2 +-
 cpp/src/metrics/trustworthiness.cu     | 2 +-
 cpp/src/metrics/v_measure.cu           | 2 +-
 11 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/cpp/src/metrics/accuracy_score.cu b/cpp/src/metrics/accuracy_score.cu
index 79b2e96bf5..bf2fe46791 100644
--- a/cpp/src/metrics/accuracy_score.cu
+++ b/cpp/src/metrics/accuracy_score.cu
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/metrics/adjusted_rand_index.cu b/cpp/src/metrics/adjusted_rand_index.cu
index f2969663d8..13045dc48b 100644
--- a/cpp/src/metrics/adjusted_rand_index.cu
+++ b/cpp/src/metrics/adjusted_rand_index.cu
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/metrics/completeness_score.cu b/cpp/src/metrics/completeness_score.cu
index f71ec7235f..c10caa4892 100644
--- a/cpp/src/metrics/completeness_score.cu
+++ b/cpp/src/metrics/completeness_score.cu
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/metrics/entropy.cu b/cpp/src/metrics/entropy.cu
index a82ff1df9d..2fdb8c7729 100644
--- a/cpp/src/metrics/entropy.cu
+++ b/cpp/src/metrics/entropy.cu
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/metrics/homogeneity_score.cu b/cpp/src/metrics/homogeneity_score.cu
index f902834821..06f9a2bbb0 100644
--- a/cpp/src/metrics/homogeneity_score.cu
+++ b/cpp/src/metrics/homogeneity_score.cu
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/metrics/kl_divergence.cu b/cpp/src/metrics/kl_divergence.cu
index 7e80f01c6a..d1d0643f3d 100644
--- a/cpp/src/metrics/kl_divergence.cu
+++ b/cpp/src/metrics/kl_divergence.cu
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/metrics/mutual_info_score.cu b/cpp/src/metrics/mutual_info_score.cu
index 3b98654907..1eca35346a 100644
--- a/cpp/src/metrics/mutual_info_score.cu
+++ b/cpp/src/metrics/mutual_info_score.cu
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/metrics/r2_score.cu b/cpp/src/metrics/r2_score.cu
index ce3f99fb02..61d60a369a 100644
--- a/cpp/src/metrics/r2_score.cu
+++ b/cpp/src/metrics/r2_score.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/metrics/rand_index.cu b/cpp/src/metrics/rand_index.cu
index 8cc4af3ff8..a1ff13f8f0 100644
--- a/cpp/src/metrics/rand_index.cu
+++ b/cpp/src/metrics/rand_index.cu
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/metrics/trustworthiness.cu b/cpp/src/metrics/trustworthiness.cu
index c3c4a644df..3d33f6e0b5 100644
--- a/cpp/src/metrics/trustworthiness.cu
+++ b/cpp/src/metrics/trustworthiness.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/metrics/v_measure.cu b/cpp/src/metrics/v_measure.cu
index a979e988fc..f8bf5e1e62 100644
--- a/cpp/src/metrics/v_measure.cu
+++ b/cpp/src/metrics/v_measure.cu
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From 5ea7a919072b1c103f02865daf0db1204ac8fcc3 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 17 Feb 2022 16:09:07 -0500
Subject: [PATCH 05/38] Adding specializations for knn gtests

---
 cpp/test/prims/knn_classify.cu   | 1 +
 cpp/test/prims/knn_regression.cu | 1 +
 2 files changed, 2 insertions(+)

diff --git a/cpp/test/prims/knn_classify.cu b/cpp/test/prims/knn_classify.cu
index 666ee8a712..5c0899a658 100644
--- a/cpp/test/prims/knn_classify.cu
+++ b/cpp/test/prims/knn_classify.cu
@@ -22,6 +22,7 @@
 #include <raft/label/classlabels.hpp>
 #include <raft/random/make_blobs.hpp>
 #include <raft/spatial/knn/knn.hpp>
+#include <raft/spatial/knn/specializations.hpp>
 #include <rmm/device_uvector.hpp>
 #include <selection/knn.cuh>
 #include <vector>
diff --git a/cpp/test/prims/knn_regression.cu b/cpp/test/prims/knn_regression.cu
index 01cfad6775..5ad96a1674 100644
--- a/cpp/test/prims/knn_regression.cu
+++ b/cpp/test/prims/knn_regression.cu
@@ -23,6 +23,7 @@
 #include <raft/linalg/reduce.hpp>
 #include <raft/random/rng.hpp>
 #include <raft/spatial/knn/knn.hpp>
+#include <raft/spatial/knn/specializations.hpp>
 #include <rmm/device_uvector.hpp>
 #include <selection/knn.cuh>
 #include <vector>

From 11e50ca259baeb99ecb67e4dd621b02c558f98ab Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 17 Feb 2022 16:10:10 -0500
Subject: [PATCH 06/38] adding more specializations

---
 cpp/src/knn/knn_opg_common.cuh | 1 +
 cpp/src/knn/knn_sparse.cu      | 1 +
 2 files changed, 2 insertions(+)

diff --git a/cpp/src/knn/knn_opg_common.cuh b/cpp/src/knn/knn_opg_common.cuh
index bc231a5cea..1017009b91 100644
--- a/cpp/src/knn/knn_opg_common.cuh
+++ b/cpp/src/knn/knn_opg_common.cuh
@@ -29,6 +29,7 @@
 #include <raft/cudart_utils.h>
 #include <raft/mr/device/allocator.hpp>
 #include <raft/spatial/knn/knn.hpp>
+#include <raft/spatial/knn/specializations.hpp>
 
 #include <cstddef>
 #include <memory>
diff --git a/cpp/src/knn/knn_sparse.cu b/cpp/src/knn/knn_sparse.cu
index 22fe9a186e..5546355c72 100644
--- a/cpp/src/knn/knn_sparse.cu
+++ b/cpp/src/knn/knn_sparse.cu
@@ -18,6 +18,7 @@
 #include <cuml/neighbors/knn_sparse.hpp>
 
 #include <raft/sparse/selection/knn.hpp>
+#include <raft/spatial/knn/specializations.hpp>
 
 #include <cusparse_v2.h>
 

From 7cc8d21d773e8f13fa69aabd9cec87c5172c9e1f Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 17 Feb 2022 16:18:08 -0500
Subject: [PATCH 07/38] More specializations

---
 cpp/src/hdbscan/hdbscan.cu  | 1 +
 cpp/src/tsne/tsne.cu        | 1 +
 cpp/src/umap/umap.cu        | 1 +
 cpp/test/sg/hdbscan_test.cu | 1 +
 cpp/test/sg/knn_test.cu     | 1 +
 5 files changed, 5 insertions(+)

diff --git a/cpp/src/hdbscan/hdbscan.cu b/cpp/src/hdbscan/hdbscan.cu
index 46c6b18600..52ef3c3e6c 100644
--- a/cpp/src/hdbscan/hdbscan.cu
+++ b/cpp/src/hdbscan/hdbscan.cu
@@ -16,6 +16,7 @@
 
 #include "detail/condense.cuh"
 #include <cuml/cluster/hdbscan.hpp>
+#include <raft/spatial/knn/specializations.hpp>
 
 #include "runner.h"
 
diff --git a/cpp/src/tsne/tsne.cu b/cpp/src/tsne/tsne.cu
index 378e854a3e..4a8f8bc377 100644
--- a/cpp/src/tsne/tsne.cu
+++ b/cpp/src/tsne/tsne.cu
@@ -16,6 +16,7 @@
 
 #include "tsne_runner.cuh"
 #include <cuml/manifold/tsne.h>
+#include <raft/spatial/knn/specializations.hpp>
 
 namespace ML {
 
diff --git a/cpp/src/umap/umap.cu b/cpp/src/umap/umap.cu
index 08eb3bcfc0..23c63890d1 100644
--- a/cpp/src/umap/umap.cu
+++ b/cpp/src/umap/umap.cu
@@ -18,6 +18,7 @@
 #include <cuml/manifold/common.hpp>
 #include <cuml/manifold/umap.hpp>
 #include <cuml/manifold/umapparams.h>
+#include <raft/spatial/knn/specializations.hpp>
 
 #include <raft/cuda_utils.cuh>
 
diff --git a/cpp/test/sg/hdbscan_test.cu b/cpp/test/sg/hdbscan_test.cu
index 832bb42a37..176b25b55a 100644
--- a/cpp/test/sg/hdbscan_test.cu
+++ b/cpp/test/sg/hdbscan_test.cu
@@ -27,6 +27,7 @@
 #include <hdbscan/detail/utils.h>
 
 #include <raft/stats/adjusted_rand_index.hpp>
+#include <raft/spatial/knn/specializations.hpp>
 
 #include <raft/sparse/hierarchy/detail/agglomerative.cuh>
 
diff --git a/cpp/test/sg/knn_test.cu b/cpp/test/sg/knn_test.cu
index 9c56d47fdd..61aa13992f 100644
--- a/cpp/test/sg/knn_test.cu
+++ b/cpp/test/sg/knn_test.cu
@@ -26,6 +26,7 @@
 #include <cuml/datasets/make_blobs.hpp>
 
 #include <cuml/neighbors/knn.hpp>
+#include <raft/spatial/knn/specializations.hpp>
 
 namespace ML {
 

From e08dea6bc07709facddb95fd49a9c72e31a0ebd7 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 17 Feb 2022 16:57:33 -0500
Subject: [PATCH 08/38] Updating style

---
 cpp/src/knn/knn_sparse.cu   | 2 +-
 cpp/test/sg/hdbscan_test.cu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/knn/knn_sparse.cu b/cpp/src/knn/knn_sparse.cu
index 5546355c72..67462fc769 100644
--- a/cpp/src/knn/knn_sparse.cu
+++ b/cpp/src/knn/knn_sparse.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/test/sg/hdbscan_test.cu b/cpp/test/sg/hdbscan_test.cu
index 176b25b55a..d51d34ccae 100644
--- a/cpp/test/sg/hdbscan_test.cu
+++ b/cpp/test/sg/hdbscan_test.cu
@@ -26,8 +26,8 @@
 #include <hdbscan/detail/extract.cuh>
 #include <hdbscan/detail/utils.h>
 
-#include <raft/stats/adjusted_rand_index.hpp>
 #include <raft/spatial/knn/specializations.hpp>
+#include <raft/stats/adjusted_rand_index.hpp>
 
 #include <raft/sparse/hierarchy/detail/agglomerative.cuh>
 

From 41a1c5ec8723de7dbc409992bcb6578989ffa358 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 18 Feb 2022 13:58:20 -0500
Subject: [PATCH 09/38] No longer building brute_force_knn prim anywhere

---
 cpp/src/hdbscan/detail/reachability.cuh | 1 +
 cpp/src/metrics/trustworthiness.cu      | 2 ++
 cpp/src/umap/knn_graph/algo.cuh         | 3 ++-
 cpp/test/prims/knn_classify.cu          | 2 +-
 cpp/test/prims/knn_regression.cu        | 2 +-
 cpp/test/sg/tsne_test.cu                | 1 +
 cpp/test/sg/umap_parametrizable_test.cu | 3 ++-
 7 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index 73756c3bfa..2324d10cc3 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -32,6 +32,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cuml/neighbors/knn.hpp>
+#include <raft/spatial/knn/specializations.hpp>
 #include <raft/distance/distance.hpp>
 
 #include <thrust/transform.h>
diff --git a/cpp/src/metrics/trustworthiness.cu b/cpp/src/metrics/trustworthiness.cu
index 3d33f6e0b5..502dc0aeb6 100644
--- a/cpp/src/metrics/trustworthiness.cu
+++ b/cpp/src/metrics/trustworthiness.cu
@@ -20,6 +20,7 @@
 
 #include <raft/distance/distance.hpp>
 #include <raft/distance/specializations.hpp>
+#include <raft/spatial/knn/specializations.hpp>
 
 #include <raft/handle.hpp>
 
@@ -54,6 +55,7 @@ double trustworthiness_score(const raft::handle_t& h,
     h, X, X_embedded, n, m, d, n_neighbors, batchSize);
 }
 
+
 template double trustworthiness_score<float, raft::distance::DistanceType::L2SqrtUnexpanded>(
   const raft::handle_t& h,
   const float* X,
diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh
index 632970cdab..265dbf8954 100644
--- a/cpp/src/umap/knn_graph/algo.cuh
+++ b/cpp/src/umap/knn_graph/algo.cuh
@@ -23,6 +23,7 @@
 #include <raft/distance/distance_type.hpp>
 #include <raft/linalg/unary_op.hpp>
 #include <raft/sparse/selection/knn.hpp>
+#include <raft/spatial/knn/specializations.hpp>
 #include <selection/knn.cuh>
 
 #include <raft/cudart_utils.h>
@@ -61,7 +62,7 @@ void launcher(const raft::handle_t& handle,
   ptrs[0]  = inputsA.X;
   sizes[0] = inputsA.n;
 
-  raft::spatial::knn::brute_force_knn(handle,
+  raft::spatial::knn::brute_force_knn<long, float, int>(handle,
                                       ptrs,
                                       sizes,
                                       inputsA.d,
diff --git a/cpp/test/prims/knn_classify.cu b/cpp/test/prims/knn_classify.cu
index 5c0899a658..916742bd52 100644
--- a/cpp/test/prims/knn_classify.cu
+++ b/cpp/test/prims/knn_classify.cu
@@ -75,7 +75,7 @@ class KNNClassifyTest : public ::testing::TestWithParam<KNNClassifyInputs> {
     ptrs[0]  = train_samples.data();
     sizes[0] = params.rows;
 
-    raft::spatial::knn::brute_force_knn(handle,
+    raft::spatial::knn::brute_force_knn<long, float, int>(handle,
                                         ptrs,
                                         sizes,
                                         params.cols,
diff --git a/cpp/test/prims/knn_regression.cu b/cpp/test/prims/knn_regression.cu
index 5ad96a1674..0e5c3b1c02 100644
--- a/cpp/test/prims/knn_regression.cu
+++ b/cpp/test/prims/knn_regression.cu
@@ -100,7 +100,7 @@ class KNNRegressionTest : public ::testing::TestWithParam<KNNRegressionInputs> {
     ptrs[0]  = train_samples.data();
     sizes[0] = params.rows;
 
-    raft::spatial::knn::brute_force_knn(handle,
+    raft::spatial::knn::brute_force_knn<long, float, int>(handle,
                                         ptrs,
                                         sizes,
                                         params.cols,
diff --git a/cpp/test/sg/tsne_test.cu b/cpp/test/sg/tsne_test.cu
index c7c79d8535..5964f7220d 100644
--- a/cpp/test/sg/tsne_test.cu
+++ b/cpp/test/sg/tsne_test.cu
@@ -29,6 +29,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <tsne/distances.cuh>
+#include <raft/spatial/knn/specializations.hpp>
 #include <tsne/tsne_runner.cuh>
 #include <tsne/utils.cuh>
 #include <vector>
diff --git a/cpp/test/sg/umap_parametrizable_test.cu b/cpp/test/sg/umap_parametrizable_test.cu
index ffeec27eec..6788981215 100644
--- a/cpp/test/sg/umap_parametrizable_test.cu
+++ b/cpp/test/sg/umap_parametrizable_test.cu
@@ -23,6 +23,7 @@
 #include <cuml/manifold/umapparams.h>
 #include <cuml/metrics/metrics.hpp>
 #include <cuml/neighbors/knn.hpp>
+#include <raft/spatial/knn/specializations.hpp>
 #include <datasets/digits.h>
 #include <raft/cudart_utils.h>
 #include <test_utils.h>
@@ -142,7 +143,7 @@ class UMAPParametrizableTest : public ::testing::Test {
       ptrs[0]  = X;
       sizes[0] = n_samples;
 
-      raft::spatial::knn::brute_force_knn(handle,
+      raft::spatial::knn::brute_force_knn<long, float, int>(handle,
                                           ptrs,
                                           sizes,
                                           n_features,

From a421dab0c8da86f0c1ec2ff0a12e4304a844927a Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 18 Feb 2022 13:59:08 -0500
Subject: [PATCH 10/38] Fixing style

---
 cpp/src/hdbscan/detail/reachability.cuh |  2 +-
 cpp/src/metrics/trustworthiness.cu      |  1 -
 cpp/src/umap/knn_graph/algo.cuh         | 16 ++++++++--------
 cpp/test/prims/knn_classify.cu          | 16 ++++++++--------
 cpp/test/prims/knn_regression.cu        | 16 ++++++++--------
 cpp/test/sg/tsne_test.cu                |  2 +-
 cpp/test/sg/umap_parametrizable_test.cu | 18 +++++++++---------
 7 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index 2324d10cc3..7ced12663f 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -32,8 +32,8 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cuml/neighbors/knn.hpp>
-#include <raft/spatial/knn/specializations.hpp>
 #include <raft/distance/distance.hpp>
+#include <raft/spatial/knn/specializations.hpp>
 
 #include <thrust/transform.h>
 
diff --git a/cpp/src/metrics/trustworthiness.cu b/cpp/src/metrics/trustworthiness.cu
index 502dc0aeb6..515bd79003 100644
--- a/cpp/src/metrics/trustworthiness.cu
+++ b/cpp/src/metrics/trustworthiness.cu
@@ -55,7 +55,6 @@ double trustworthiness_score(const raft::handle_t& h,
     h, X, X_embedded, n, m, d, n_neighbors, batchSize);
 }
 
-
 template double trustworthiness_score<float, raft::distance::DistanceType::L2SqrtUnexpanded>(
   const raft::handle_t& h,
   const float* X,
diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh
index 265dbf8954..d6e17b4a43 100644
--- a/cpp/src/umap/knn_graph/algo.cuh
+++ b/cpp/src/umap/knn_graph/algo.cuh
@@ -63,14 +63,14 @@ void launcher(const raft::handle_t& handle,
   sizes[0] = inputsA.n;
 
   raft::spatial::knn::brute_force_knn<long, float, int>(handle,
-                                      ptrs,
-                                      sizes,
-                                      inputsA.d,
-                                      inputsB.X,
-                                      inputsB.n,
-                                      out.knn_indices,
-                                      out.knn_dists,
-                                      n_neighbors);
+                                                        ptrs,
+                                                        sizes,
+                                                        inputsA.d,
+                                                        inputsB.X,
+                                                        inputsB.n,
+                                                        out.knn_indices,
+                                                        out.knn_dists,
+                                                        n_neighbors);
 }
 
 // Instantiation for dense inputs, int indices
diff --git a/cpp/test/prims/knn_classify.cu b/cpp/test/prims/knn_classify.cu
index 916742bd52..df73d352e1 100644
--- a/cpp/test/prims/knn_classify.cu
+++ b/cpp/test/prims/knn_classify.cu
@@ -76,14 +76,14 @@ class KNNClassifyTest : public ::testing::TestWithParam<KNNClassifyInputs> {
     sizes[0] = params.rows;
 
     raft::spatial::knn::brute_force_knn<long, float, int>(handle,
-                                        ptrs,
-                                        sizes,
-                                        params.cols,
-                                        train_samples.data(),
-                                        params.rows,
-                                        knn_indices.data(),
-                                        knn_dists.data(),
-                                        params.k);
+                                                          ptrs,
+                                                          sizes,
+                                                          params.cols,
+                                                          train_samples.data(),
+                                                          params.rows,
+                                                          knn_indices.data(),
+                                                          knn_dists.data(),
+                                                          params.k);
 
     std::vector<int*> y;
     y.push_back(train_labels.data());
diff --git a/cpp/test/prims/knn_regression.cu b/cpp/test/prims/knn_regression.cu
index 0e5c3b1c02..840303a43c 100644
--- a/cpp/test/prims/knn_regression.cu
+++ b/cpp/test/prims/knn_regression.cu
@@ -101,14 +101,14 @@ class KNNRegressionTest : public ::testing::TestWithParam<KNNRegressionInputs> {
     sizes[0] = params.rows;
 
     raft::spatial::knn::brute_force_knn<long, float, int>(handle,
-                                        ptrs,
-                                        sizes,
-                                        params.cols,
-                                        train_samples.data(),
-                                        params.rows,
-                                        knn_indices.data(),
-                                        knn_dists.data(),
-                                        params.k);
+                                                          ptrs,
+                                                          sizes,
+                                                          params.cols,
+                                                          train_samples.data(),
+                                                          params.rows,
+                                                          knn_indices.data(),
+                                                          knn_dists.data(),
+                                                          params.k);
 
     std::vector<float*> y;
     y.push_back(train_labels.data());
diff --git a/cpp/test/sg/tsne_test.cu b/cpp/test/sg/tsne_test.cu
index 5964f7220d..177e55d32a 100644
--- a/cpp/test/sg/tsne_test.cu
+++ b/cpp/test/sg/tsne_test.cu
@@ -26,10 +26,10 @@
 #include <gtest/gtest.h>
 #include <iostream>
 #include <raft/cudart_utils.h>
+#include <raft/spatial/knn/specializations.hpp>
 #include <stdio.h>
 #include <stdlib.h>
 #include <tsne/distances.cuh>
-#include <raft/spatial/knn/specializations.hpp>
 #include <tsne/tsne_runner.cuh>
 #include <tsne/utils.cuh>
 #include <vector>
diff --git a/cpp/test/sg/umap_parametrizable_test.cu b/cpp/test/sg/umap_parametrizable_test.cu
index 6788981215..498db91b1d 100644
--- a/cpp/test/sg/umap_parametrizable_test.cu
+++ b/cpp/test/sg/umap_parametrizable_test.cu
@@ -23,9 +23,9 @@
 #include <cuml/manifold/umapparams.h>
 #include <cuml/metrics/metrics.hpp>
 #include <cuml/neighbors/knn.hpp>
-#include <raft/spatial/knn/specializations.hpp>
 #include <datasets/digits.h>
 #include <raft/cudart_utils.h>
+#include <raft/spatial/knn/specializations.hpp>
 #include <test_utils.h>
 
 #include <datasets/digits.h>
@@ -144,14 +144,14 @@ class UMAPParametrizableTest : public ::testing::Test {
       sizes[0] = n_samples;
 
       raft::spatial::knn::brute_force_knn<long, float, int>(handle,
-                                          ptrs,
-                                          sizes,
-                                          n_features,
-                                          X,
-                                          n_samples,
-                                          knn_indices,
-                                          knn_dists,
-                                          umap_params.n_neighbors);
+                                                            ptrs,
+                                                            sizes,
+                                                            n_features,
+                                                            X,
+                                                            n_samples,
+                                                            knn_indices,
+                                                            knn_dists,
+                                                            umap_params.n_neighbors);
 
       handle.sync_stream(stream);
     }

From bb474e2925c52f1f741ecfb5b2050be5fbad31ef Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 18 Feb 2022 14:08:23 -0500
Subject: [PATCH 11/38] Fixing copyright and removing a couple more distance
 funcs

---
 cpp/src_prims/matrix/grammatrix.cuh     | 2 ++
 cpp/src_prims/matrix/kernelmatrices.cuh | 1 +
 cpp/test/prims/gram.cu                  | 1 +
 3 files changed, 4 insertions(+)

diff --git a/cpp/src_prims/matrix/grammatrix.cuh b/cpp/src_prims/matrix/grammatrix.cuh
index bd66f815e2..bc34e9a564 100644
--- a/cpp/src_prims/matrix/grammatrix.cuh
+++ b/cpp/src_prims/matrix/grammatrix.cuh
@@ -17,6 +17,8 @@
 #pragma once
 
 #include <raft/distance/distance.hpp>
+#include <raft/distance/specializations.hpp>
+
 // #TODO: Replace with public header when ready
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/linalg/gemm.hpp>
diff --git a/cpp/src_prims/matrix/kernelmatrices.cuh b/cpp/src_prims/matrix/kernelmatrices.cuh
index 6e40dc243a..e1bc3acfa9 100644
--- a/cpp/src_prims/matrix/kernelmatrices.cuh
+++ b/cpp/src_prims/matrix/kernelmatrices.cuh
@@ -19,6 +19,7 @@
 #include "grammatrix.cuh"
 #include <raft/cuda_utils.cuh>
 #include <raft/distance/distance.hpp>
+#include <raft/distance/specializations.hpp>
 #include <raft/linalg/gemm.hpp>
 
 namespace MLCommon {
diff --git a/cpp/test/prims/gram.cu b/cpp/test/prims/gram.cu
index b0c5ae2086..6322171893 100644
--- a/cpp/test/prims/gram.cu
+++ b/cpp/test/prims/gram.cu
@@ -23,6 +23,7 @@
 #include <memory>
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
+#include <raft/distance/specializations.hpp>
 #include <raft/mr/host/allocator.hpp>
 #include <raft/random/rng.hpp>
 #include <rmm/device_uvector.hpp>

From dbde26756367f3cfa1f87cd7cf8d197b678aa932 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 18 Feb 2022 14:12:27 -0500
Subject: [PATCH 12/38] Removing prims moved over to raft

---
 cpp/test/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 38832131d7..c059ef65e1 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -207,11 +207,9 @@ if(BUILD_PRIMS_TESTS)
   ConfigureTest(PREFIX PRIMS NAME ADD_SUB_DEV_SCALAR_TEST PATH prims/add_sub_dev_scalar.cu)
   ConfigureTest(PREFIX PRIMS NAME BATCHED_CSR_TEST PATH prims/batched/csr.cu)
   ConfigureTest(PREFIX PRIMS NAME BATCHED_GEMV_TEST PATH prims/batched/gemv.cu)
-  ConfigureTest(PREFIX PRIMS NAME BATCHED_INFORMATION_CRIT_TEST PATH prims/batched/information_criterion.cu)
   ConfigureTest(PREFIX PRIMS NAME BATCHED_MAKE_SYMM_TEST PATH prims/batched/make_symm.cu)
   ConfigureTest(PREFIX PRIMS NAME BATCHED_MATRIX_TEST PATH prims/batched/matrix.cu)
   ConfigureTest(PREFIX PRIMS NAME CACHE_TEST PATH prims/cache.cu)
-  ConfigureTest(PREFIX PRIMS NAME COLUMNSORT_TEST PATH prims/columnSort.cu)
   ConfigureTest(PREFIX PRIMS NAME DECOUPLED_LOOKBACK_TEST PATH prims/decoupled_lookback.cu)
   ConfigureTest(PREFIX PRIMS NAME DEVICE_UTILS_TEST PATH prims/device_utils.cu)
   ConfigureTest(PREFIX PRIMS NAME ELTWISE2D_TEST PATH prims/eltwise2d.cu)

From 375142e5a902b035ebf4bdc320bb53d1f18f9913 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 18 Feb 2022 22:34:58 -0500
Subject: [PATCH 13/38] Reverting get_raft cmake

---
 cpp/cmake/thirdparty/get_raft.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index 2dca5bb329..63f795d519 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -57,8 +57,8 @@ set(CUML_BRANCH_VERSION_raft "${CUML_VERSION_MAJOR}.${CUML_VERSION_MINOR}")
 # To use a different RAFT locally, set the CMake variable
 # CPM_raft_SOURCE=/path/to/local/raft
 find_and_configure_raft(VERSION          ${CUML_MIN_VERSION_raft}
-                        FORK             cjnolet
-                        PINNED_TAG       imp-2204-metrics_from_cuml
+                        FORK             rapidsai
+                        PINNED_TAG       branch-${CUML_BRANCH_VERSION_raft}
                         USE_RAFT_NN      ${CUML_USE_RAFT_NN}
                         USE_FAISS_STATIC ${CUML_USE_FAISS_STATIC}
                         )

From 8d57a72c3caf5c6ebbbefbc429a38fa41a02ffd1 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Sat, 19 Feb 2022 07:32:13 -0500
Subject: [PATCH 14/38] Using distance already specialized

---
 cpp/src_prims/matrix/grammatrix.cuh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/src_prims/matrix/grammatrix.cuh b/cpp/src_prims/matrix/grammatrix.cuh
index bc34e9a564..1439f55ffe 100644
--- a/cpp/src_prims/matrix/grammatrix.cuh
+++ b/cpp/src_prims/matrix/grammatrix.cuh
@@ -214,9 +214,8 @@ class GramMatrixBase {
                         int ld2,
                         int ld_out)
   {
-    auto fin_op = [] __device__(math_t d_val, int idx) { return d_val; };
     raft::distance::distance<raft::distance::DistanceType::L2Unexpanded, math_t, math_t, math_t>(
-      x1, x2, out, n1, n2, n_cols, NULL, 0, fin_op, stream, is_row_major);
+      x1, x2, out, n1, n2, n_cols, stream, is_row_major);
   }
 };
 };  // end namespace Matrix

From d95746a6950e66c56be16397e316e1e034749232 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 29 Mar 2022 22:37:03 -0400
Subject: [PATCH 15/38] updating get_raft

---
 cpp/cmake/thirdparty/get_raft.cmake | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index d63087a7fb..44c1d2c2f2 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -22,10 +22,7 @@ function(find_and_configure_raft)
     cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
             "${multiValueArgs}" ${ARGN} )
 
-    if(PKG_CLONE_ON_PIN AND NOT PKG_PINNED_TAG STREQUAL "branch-${CUML_BRANCH_VERSION_raft}")
-        message(STATUS "CUML: RAFT pinned tag found: ${PKG_PINNED_TAG}. Cloning raft locally.")
-        set(CPM_DOWNLOAD_raft ON)
-    endif()
+    set(CPM_DOWNLOAD_raft ON)
 
     if(PKG_USE_RAFT_STATIC)
         message(STATUS "CUML: Cloning raft locally to build static libraries.")

From 3f24aeab5d933a0798dd147bd8b6a6f36bfd5869 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 29 Mar 2022 22:49:22 -0400
Subject: [PATCH 16/38] Fixing includes

---
 cpp/src_prims/metrics/homogeneity_score.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src_prims/metrics/homogeneity_score.cuh b/cpp/src_prims/metrics/homogeneity_score.cuh
index a11b971f02..fdd7c1cd53 100644
--- a/cpp/src_prims/metrics/homogeneity_score.cuh
+++ b/cpp/src_prims/metrics/homogeneity_score.cuh
@@ -20,8 +20,8 @@
  * contain only data points which are members of a single class.
  */
 
-#include "entropy.cuh"
-#include "mutual_info_score.cuh"
+#include <entropy.cuh>
+#include <mutual_info_score.cuh>
 
 namespace MLCommon {
 

From 1ba60f56f12f2e9ae688df717bde2edfc1ff21f6 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 30 Mar 2022 23:18:55 -0400
Subject: [PATCH 17/38] Removing raft allocator from gram

---
 cpp/test/prims/gram.cu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/test/prims/gram.cu b/cpp/test/prims/gram.cu
index 6322171893..bd4d2b7e49 100644
--- a/cpp/test/prims/gram.cu
+++ b/cpp/test/prims/gram.cu
@@ -24,7 +24,6 @@
 #include <raft/cuda_utils.cuh>
 #include <raft/cudart_utils.h>
 #include <raft/distance/specializations.hpp>
-#include <raft/mr/host/allocator.hpp>
 #include <raft/random/rng.hpp>
 #include <rmm/device_uvector.hpp>
 

From 0e0e69322f41946f565df430dbad1ced52198abc Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 6 Apr 2022 17:09:33 -0400
Subject: [PATCH 18/38] Removing prims benchmarks which are being moved to raft

---
 cpp/bench/CMakeLists.txt             |  15 +---
 cpp/bench/prims/add.cu               |  69 ----------------
 cpp/bench/prims/distance_common.cuh  | 109 ------------------------
 cpp/bench/prims/distance_cosine.cu   |  27 ------
 cpp/bench/prims/distance_exp_l2.cu   |  28 -------
 cpp/bench/prims/distance_l1.cu       |  27 ------
 cpp/bench/prims/distance_unexp_l2.cu |  28 -------
 cpp/bench/prims/fused_l2_nn.cu       | 114 -------------------------
 cpp/bench/prims/make_blobs.cu        |  92 ---------------------
 cpp/bench/prims/map_then_reduce.cu   |  82 ------------------
 cpp/bench/prims/matrix_vector_op.cu  |  96 ---------------------
 cpp/bench/prims/permute.cu           | 104 -----------------------
 cpp/bench/prims/reduce.cu            |  83 -------------------
 cpp/bench/prims/rng.cu               | 119 ---------------------------
 14 files changed, 2 insertions(+), 991 deletions(-)
 delete mode 100644 cpp/bench/prims/add.cu
 delete mode 100644 cpp/bench/prims/distance_common.cuh
 delete mode 100644 cpp/bench/prims/distance_cosine.cu
 delete mode 100644 cpp/bench/prims/distance_exp_l2.cu
 delete mode 100644 cpp/bench/prims/distance_l1.cu
 delete mode 100644 cpp/bench/prims/distance_unexp_l2.cu
 delete mode 100644 cpp/bench/prims/fused_l2_nn.cu
 delete mode 100644 cpp/bench/prims/make_blobs.cu
 delete mode 100644 cpp/bench/prims/map_then_reduce.cu
 delete mode 100644 cpp/bench/prims/matrix_vector_op.cu
 delete mode 100644 cpp/bench/prims/permute.cu
 delete mode 100644 cpp/bench/prims/reduce.cu
 delete mode 100644 cpp/bench/prims/rng.cu

diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt
index 3647f0ed8a..57233b4b5a 100644
--- a/cpp/bench/CMakeLists.txt
+++ b/cpp/bench/CMakeLists.txt
@@ -74,20 +74,9 @@ endif()
 if(BUILD_CUML_PRIMS_BENCH)
   # (please keep the filenames in alphabetical order)
   add_executable(${PRIMS_BENCH_TARGET}
-    prims/add.cu
-    prims/distance_cosine.cu
-    prims/distance_exp_l2.cu
-    prims/distance_l1.cu
-    prims/distance_unexp_l2.cu
-    prims/fused_l2_nn.cu
-    prims/gram_matrix.cu
+          prims/gram_matrix.cu
     prims/main.cpp
-    prims/make_blobs.cu
-    prims/map_then_reduce.cu
-    prims/matrix_vector_op.cu
-    prims/permute.cu
-    prims/reduce.cu
-    prims/rng.cu)
+          )
 
   target_compile_options(${PRIMS_BENCH_TARGET}
         PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUML_CXX_FLAGS}>"
diff --git a/cpp/bench/prims/add.cu b/cpp/bench/prims/add.cu
deleted file mode 100644
index 5a9340cd2f..0000000000
--- a/cpp/bench/prims/add.cu
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/ml_benchmark.hpp>
-#include <raft/linalg/add.hpp>
-
-namespace MLCommon {
-namespace Bench {
-namespace LinAlg {
-
-struct AddParams {
-  int len;
-};  // struct AddParams
-
-template <typename T>
-struct AddBench : public Fixture {
-  AddBench(const std::string& name, const AddParams& p) : Fixture(name), params(p) {}
-
- protected:
-  void allocateBuffers(const ::benchmark::State& state) override
-  {
-    alloc(ptr0, params.len, true);
-    alloc(ptr1, params.len, true);
-  }
-
-  void deallocateBuffers(const ::benchmark::State& state) override
-  {
-    dealloc(ptr0, params.len);
-    dealloc(ptr1, params.len);
-  }
-
-  void runBenchmark(::benchmark::State& state) override
-  {
-    loopOnState(state, [this]() { raft::linalg::add(ptr0, ptr0, ptr1, params.len, stream); });
-  }
-
- private:
-  AddParams params;
-  T *ptr0, *ptr1;
-};  // struct AddBench
-
-static std::vector<AddParams> getInputs()
-{
-  return {
-    {256 * 1024 * 1024},
-    {256 * 1024 * 1024 + 2},
-    {256 * 1024 * 1024 + 1},
-  };
-}
-
-ML_BENCH_REGISTER(AddParams, AddBench<float>, "", getInputs());
-ML_BENCH_REGISTER(AddParams, AddBench<double>, "", getInputs());
-
-}  // namespace LinAlg
-}  // namespace Bench
-}  // namespace MLCommon
diff --git a/cpp/bench/prims/distance_common.cuh b/cpp/bench/prims/distance_common.cuh
deleted file mode 100644
index dcc8292f82..0000000000
--- a/cpp/bench/prims/distance_common.cuh
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/ml_benchmark.hpp>
-#include <raft/cudart_utils.h>
-#include <raft/distance/distance.hpp>
-#include <raft/distance/specializations.hpp>
-
-namespace MLCommon {
-namespace Bench {
-namespace Distance {
-
-struct Params {
-  int m, n, k;
-  bool isRowMajor;
-};  // struct Params
-
-template <typename T, raft::distance::DistanceType DType>
-struct Distance : public Fixture {
-  Distance(const std::string& name, const Params& p)
-    : Fixture(name), params(p), x(0, stream), y(0, stream), out(0, stream), workspace(0, stream)
-  {
-  }
-
- protected:
-  void allocateBuffers(const ::benchmark::State& state) override
-  {
-    x.resize(params.m * params.k, stream);
-    y.resize(params.n * params.k, stream);
-    out.resize(params.m * params.n, stream);
-    RAFT_CUDA_TRY(cudaMemsetAsync(x.data(), 0, x.size() * sizeof(T), stream));
-    RAFT_CUDA_TRY(cudaMemsetAsync(y.data(), 0, y.size() * sizeof(T), stream));
-    RAFT_CUDA_TRY(cudaMemsetAsync(out.data(), 0, out.size() * sizeof(T), stream));
-    worksize = raft::distance::getWorkspaceSize<DType, T, T, T>(
-      x.data(), y.data(), params.m, params.n, params.k);
-    workspace.resize(worksize, stream);
-  }
-
-  void deallocateBuffers(const ::benchmark::State& state) override
-  {
-    x.release();
-    y.release();
-    out.release();
-    workspace.release();
-  }
-  void runBenchmark(::benchmark::State& state) override
-  {
-    loopOnState(state, [this]() {
-      raft::distance::distance<DType, T, T, T>(x.data(),
-                                               y.data(),
-                                               out.data(),
-                                               params.m,
-                                               params.n,
-                                               params.k,
-                                               (void*)workspace.data(),
-                                               worksize,
-                                               stream,
-                                               params.isRowMajor);
-    });
-  }
-
- private:
-  Params params;
-  rmm::device_uvector<T> x, y, out;
-  rmm::device_uvector<char> workspace;
-  size_t worksize;
-};  // struct Distance
-
-static std::vector<Params> getInputs()
-{
-  return {
-    {32, 16384, 16384, true},    {64, 16384, 16384, true},     {128, 16384, 16384, true},
-    {256, 16384, 16384, true},   {512, 16384, 16384, true},    {1024, 16384, 16384, true},
-    {16384, 32, 16384, true},    {16384, 64, 16384, true},     {16384, 128, 16384, true},
-    {16384, 256, 16384, true},   {16384, 512, 16384, true},    {16384, 1024, 16384, true},
-    {16384, 16384, 32, true},    {16384, 16384, 64, true},     {16384, 16384, 128, true},
-    {16384, 16384, 256, true},   {16384, 16384, 512, true},    {16384, 16384, 1024, true},
-    {16384, 16384, 16384, true}, {32, 16384, 16384, false},    {64, 16384, 16384, false},
-    {128, 16384, 16384, false},  {256, 16384, 16384, false},   {512, 16384, 16384, false},
-    {1024, 16384, 16384, false}, {16384, 32, 16384, false},    {16384, 64, 16384, false},
-    {16384, 128, 16384, false},  {16384, 256, 16384, false},   {16384, 512, 16384, false},
-    {16384, 1024, 16384, false}, {16384, 16384, 32, false},    {16384, 16384, 64, false},
-    {16384, 16384, 128, false},  {16384, 16384, 256, false},   {16384, 16384, 512, false},
-    {16384, 16384, 1024, false}, {16384, 16384, 16384, false},
-  };
-}
-
-#define DIST_BENCH_REGISTER(Name, Metric)              \
-  using Name##F = Distance<float, Metric>;             \
-  ML_BENCH_REGISTER(Params, Name##F, "", getInputs()); \
-  using Name##D = Distance<double, Metric>;            \
-  ML_BENCH_REGISTER(Params, Name##D, "", getInputs())
-
-}  // namespace Distance
-}  // namespace Bench
-}  // namespace MLCommon
diff --git a/cpp/bench/prims/distance_cosine.cu b/cpp/bench/prims/distance_cosine.cu
deleted file mode 100644
index c3256f25bc..0000000000
--- a/cpp/bench/prims/distance_cosine.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "distance_common.cuh"
-
-namespace MLCommon {
-namespace Bench {
-namespace Distance {
-
-DIST_BENCH_REGISTER(DistanceCosine, raft::distance::DistanceType::CosineExpanded);
-
-}  // namespace Distance
-}  // namespace Bench
-}  // namespace MLCommon
diff --git a/cpp/bench/prims/distance_exp_l2.cu b/cpp/bench/prims/distance_exp_l2.cu
deleted file mode 100644
index fc4a854b2c..0000000000
--- a/cpp/bench/prims/distance_exp_l2.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "distance_common.cuh"
-
-namespace MLCommon {
-namespace Bench {
-namespace Distance {
-
-DIST_BENCH_REGISTER(DistanceL2Sq, raft::distance::DistanceType::L2Expanded);
-DIST_BENCH_REGISTER(DistanceL2Sqrt, raft::distance::DistanceType::L2SqrtExpanded);
-
-}  // namespace Distance
-}  // namespace Bench
-}  // namespace MLCommon
diff --git a/cpp/bench/prims/distance_l1.cu b/cpp/bench/prims/distance_l1.cu
deleted file mode 100644
index c14d653a5b..0000000000
--- a/cpp/bench/prims/distance_l1.cu
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "distance_common.cuh"
-
-namespace MLCommon {
-namespace Bench {
-namespace Distance {
-
-DIST_BENCH_REGISTER(DistanceL1, raft::distance::DistanceType::L1);
-
-}  // namespace Distance
-}  // namespace Bench
-}  // namespace MLCommon
diff --git a/cpp/bench/prims/distance_unexp_l2.cu b/cpp/bench/prims/distance_unexp_l2.cu
deleted file mode 100644
index a26da4fe60..0000000000
--- a/cpp/bench/prims/distance_unexp_l2.cu
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "distance_common.cuh"
-
-namespace MLCommon {
-namespace Bench {
-namespace Distance {
-
-DIST_BENCH_REGISTER(DistanceUnexpL2Sq, raft::distance::DistanceType::L2Unexpanded);
-DIST_BENCH_REGISTER(DistanceUnexpL2Sqrt, raft::distance::DistanceType::L2SqrtUnexpanded);
-
-}  // namespace Distance
-}  // namespace Bench
-}  // namespace MLCommon
diff --git a/cpp/bench/prims/fused_l2_nn.cu b/cpp/bench/prims/fused_l2_nn.cu
deleted file mode 100644
index c949e119d3..0000000000
--- a/cpp/bench/prims/fused_l2_nn.cu
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/ml_benchmark.hpp>
-#include <limits>
-#include <raft/cudart_utils.h>
-#include <raft/distance/fused_l2_nn.hpp>
-#include <raft/handle.hpp>
-#include <raft/linalg/norm.hpp>
-#include <raft/random/rng.hpp>
-#include <raft/spatial/knn/specializations.hpp>
-
-namespace MLCommon {
-namespace Bench {
-namespace Distance {
-
-struct FLNParams {
-  int m, n, k;
-};  // struct FLNParams
-
-template <typename T>
-struct FusedL2NN : public Fixture {
-  FusedL2NN(const std::string& name, const FLNParams& p) : Fixture(name), params(p) {}
-
- protected:
-  void allocateBuffers(const ::benchmark::State& state) override
-  {
-    alloc(x, params.m * params.k);
-    alloc(y, params.n * params.k);
-    alloc(xn, params.m);
-    alloc(yn, params.n);
-    alloc(out, params.m);
-    alloc(workspace, params.m);
-    raft::random::Rng r(123456ULL);
-    raft::handle_t handle{stream};
-
-    r.uniform(x, params.m * params.k, T(-1.0), T(1.0), stream);
-    r.uniform(y, params.n * params.k, T(-1.0), T(1.0), stream);
-    raft::linalg::rowNorm(xn, x, params.k, params.m, raft::linalg::L2Norm, true, stream);
-    raft::linalg::rowNorm(yn, y, params.k, params.n, raft::linalg::L2Norm, true, stream);
-    raft::distance::initialize<T, cub::KeyValuePair<int, T>, int>(
-      handle, out, params.m, std::numeric_limits<T>::max(), op);
-  }
-
-  void deallocateBuffers(const ::benchmark::State& state) override
-  {
-    dealloc(x, params.m * params.k);
-    dealloc(y, params.n * params.k);
-    dealloc(xn, params.m);
-    dealloc(yn, params.n);
-    dealloc(out, params.m);
-    dealloc(workspace, params.m);
-  }
-
-  void runBenchmark(::benchmark::State& state) override
-  {
-    loopOnState(state, [this]() {
-      // it is enough to only benchmark the L2-squared metric
-      raft::distance::fusedL2NN<T, cub::KeyValuePair<int, T>, int>(out,
-                                                                   x,
-                                                                   y,
-                                                                   xn,
-                                                                   yn,
-                                                                   params.m,
-                                                                   params.n,
-                                                                   params.k,
-                                                                   (void*)workspace,
-                                                                   op,
-                                                                   pairRedOp,
-                                                                   false,
-                                                                   false,
-                                                                   stream);
-    });
-  }
-
- private:
-  FLNParams params;
-  T *x, *y, *xn, *yn;
-  cub::KeyValuePair<int, T>* out;
-  int* workspace;
-  raft::distance::KVPMinReduce<int, T> pairRedOp;
-  raft::distance::MinAndDistanceReduceOp<int, T> op;
-};  // struct FusedL2NN
-
-static std::vector<FLNParams> getInputs()
-{
-  return {
-    {32, 16384, 16384},  {64, 16384, 16384},   {128, 16384, 16384},   {256, 16384, 16384},
-    {512, 16384, 16384}, {1024, 16384, 16384}, {16384, 32, 16384},    {16384, 64, 16384},
-    {16384, 128, 16384}, {16384, 256, 16384},  {16384, 512, 16384},   {16384, 1024, 16384},
-    {16384, 16384, 32},  {16384, 16384, 64},   {16384, 16384, 128},   {16384, 16384, 256},
-    {16384, 16384, 512}, {16384, 16384, 1024}, {16384, 16384, 16384},
-  };
-}
-
-ML_BENCH_REGISTER(FLNParams, FusedL2NN<float>, "", getInputs());
-ML_BENCH_REGISTER(FLNParams, FusedL2NN<double>, "", getInputs());
-
-}  // namespace Distance
-}  // namespace Bench
-}  // namespace MLCommon
diff --git a/cpp/bench/prims/make_blobs.cu b/cpp/bench/prims/make_blobs.cu
deleted file mode 100644
index 963679d77c..0000000000
--- a/cpp/bench/prims/make_blobs.cu
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/ml_benchmark.hpp>
-#include <raft/random/make_blobs.hpp>
-
-namespace MLCommon {
-namespace Bench {
-namespace Random {
-
-struct Params {
-  int rows, cols, clusters;
-  bool row_major;
-};  // struct Params
-
-template <typename T>
-struct MakeBlobs : public Fixture {
-  MakeBlobs(const std::string& name, const Params& p)
-    : Fixture(name), params(p), data(0, stream), labels(0, stream)
-  {
-  }
-
- protected:
-  void allocateBuffers(const ::benchmark::State& state) override
-  {
-    data.resize(params.rows * params.cols, stream);
-    labels.resize(params.rows, stream);
-  }
-
-  void deallocateBuffers(const ::benchmark::State& state) override
-  {
-    data.release();
-    labels.release();
-  }
-  void runBenchmark(::benchmark::State& state) override
-  {
-    loopOnState(state, [this]() {
-      raft::random::make_blobs(data.data(),
-                               labels.data(),
-                               params.rows,
-                               params.cols,
-                               params.clusters,
-                               this->stream,
-                               params.row_major);
-    });
-  }
-
- private:
-  Params params;
-  rmm::device_uvector<T> data;
-  rmm::device_uvector<int> labels;
-};  // struct MakeBlobs
-
-static std::vector<Params> getInputs()
-{
-  std::vector<Params> out;
-  Params p;
-  for (auto rows : std::vector<int>{100000, 1000000}) {
-    for (auto cols : std::vector<int>{10, 100}) {
-      for (auto clusters : std::vector<int>{2, 10, 100}) {
-        p.rows      = rows;
-        p.cols      = cols;
-        p.clusters  = clusters;
-        p.row_major = true;
-        out.push_back(p);
-        p.row_major = false;
-        out.push_back(p);
-      }
-    }
-  }
-  return out;
-}
-
-ML_BENCH_REGISTER(Params, MakeBlobs<float>, "", getInputs());
-ML_BENCH_REGISTER(Params, MakeBlobs<double>, "", getInputs());
-
-}  // namespace Random
-}  // namespace Bench
-}  // namespace MLCommon
diff --git a/cpp/bench/prims/map_then_reduce.cu b/cpp/bench/prims/map_then_reduce.cu
deleted file mode 100644
index 0520562f7b..0000000000
--- a/cpp/bench/prims/map_then_reduce.cu
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/ml_benchmark.hpp>
-#include <raft/linalg/map_then_reduce.hpp>
-
-namespace MLCommon {
-namespace Bench {
-namespace LinAlg {
-
-struct Params {
-  int len;
-};
-
-template <typename Type>
-struct Identity {
-  HDI Type operator()(Type a) { return a; }
-};
-
-template <typename T>
-struct MapThenReduce : public Fixture {
-  MapThenReduce(const std::string& name, const Params& p) : Fixture(name), params(p) {}
-
- protected:
-  void allocateBuffers(const ::benchmark::State& state) override
-  {
-    alloc(in, params.len, true);
-    alloc(out, 1, true);
-  }
-
-  void deallocateBuffers(const ::benchmark::State& state) override
-  {
-    dealloc(in, params.len);
-    dealloc(out, 1);
-  }
-
-  void runBenchmark(::benchmark::State& state) override
-  {
-    loopOnState(state, [this]() {
-      raft::linalg::mapThenSumReduce(out, params.len, Identity<T>(), stream, in);
-    });
-  }
-
- private:
-  Params params;
-  T *out, *in;
-};  // struct MapThenReduce
-
-static std::vector<Params> getInputs()
-{
-  return {
-    {1024 * 1024},
-    {32 * 1024 * 1024},
-    {1024 * 1024 * 1024},
-    {1024 * 1024 + 2},
-    {32 * 1024 * 1024 + 2},
-    {1024 * 1024 * 1024 + 2},
-    {1024 * 1024 + 1},
-    {32 * 1024 * 1024 + 1},
-    {1024 * 1024 * 1024 + 1},
-  };
-}
-
-ML_BENCH_REGISTER(Params, MapThenReduce<float>, "", getInputs());
-ML_BENCH_REGISTER(Params, MapThenReduce<double>, "", getInputs());
-
-}  // namespace LinAlg
-}  // namespace Bench
-}  // namespace MLCommon
diff --git a/cpp/bench/prims/matrix_vector_op.cu b/cpp/bench/prims/matrix_vector_op.cu
deleted file mode 100644
index e117d96bb2..0000000000
--- a/cpp/bench/prims/matrix_vector_op.cu
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/ml_benchmark.hpp>
-#include <raft/linalg/matrix_vector_op.hpp>
-
-namespace MLCommon {
-namespace Bench {
-namespace LinAlg {
-
-struct Params {
-  int rows, cols;
-  bool rowMajor, bcastAlongRows;
-};  // struct Params
-
-template <typename T>
-struct MatVecOp : public Fixture {
-  MatVecOp(const std::string& name, const Params& p) : Fixture(name), params(p) {}
-
- protected:
-  void allocateBuffers(const ::benchmark::State& state) override
-  {
-    alloc(out, params.rows * params.cols, true);
-    alloc(in, params.rows * params.cols, true);
-    auto vecLen = params.bcastAlongRows ? params.cols : params.rows;
-    alloc(vec, vecLen, true);
-  }
-
-  void deallocateBuffers(const ::benchmark::State& state) override
-  {
-    dealloc(out, params.rows * params.cols);
-    dealloc(in, params.rows * params.cols);
-    auto vecLen = params.bcastAlongRows ? params.cols : params.rows;
-    dealloc(vec, vecLen);
-  }
-
-  void runBenchmark(::benchmark::State& state) override
-  {
-    loopOnState(state, [this]() {
-      raft::linalg::matrixVectorOp(out,
-                                   in,
-                                   vec,
-                                   params.cols,
-                                   params.rows,
-                                   params.rowMajor,
-                                   params.bcastAlongRows,
-                                   raft::Sum<T>(),
-                                   stream);
-    });
-  }
-
- private:
-  Params params;
-  T *out, *in, *vec;
-};  // struct MatVecOp
-
-static std::vector<Params> getInputs()
-{
-  return {
-    {1024, 128, true, true},       {1024 * 1024, 128, true, true},
-    {1024, 128 + 2, true, true},   {1024 * 1024, 128 + 2, true, true},
-    {1024, 128 + 1, true, true},   {1024 * 1024, 128 + 1, true, true},
-
-    {1024, 128, true, false},      {1024 * 1024, 128, true, false},
-    {1024, 128 + 2, true, false},  {1024 * 1024, 128 + 2, true, false},
-    {1024, 128 + 1, true, false},  {1024 * 1024, 128 + 1, true, false},
-
-    {1024, 128, false, false},     {1024 * 1024, 128, false, false},
-    {1024, 128 + 2, false, false}, {1024 * 1024, 128 + 2, false, false},
-    {1024, 128 + 1, false, false}, {1024 * 1024, 128 + 1, false, false},
-
-    {1024, 128, false, true},      {1024 * 1024, 128, false, true},
-    {1024, 128 + 2, false, true},  {1024 * 1024, 128 + 2, false, true},
-    {1024, 128 + 1, false, true},  {1024 * 1024, 128 + 1, false, true},
-  };
-}
-
-ML_BENCH_REGISTER(Params, MatVecOp<float>, "", getInputs());
-ML_BENCH_REGISTER(Params, MatVecOp<double>, "", getInputs());
-
-}  // namespace LinAlg
-}  // namespace Bench
-}  // namespace MLCommon
diff --git a/cpp/bench/prims/permute.cu b/cpp/bench/prims/permute.cu
deleted file mode 100644
index 9f6c6d40a7..0000000000
--- a/cpp/bench/prims/permute.cu
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/ml_benchmark.hpp>
-#include <raft/cudart_utils.h>
-#include <raft/random/permute.hpp>
-#include <raft/random/rng.hpp>
-
-namespace MLCommon {
-namespace Bench {
-namespace Random {
-
-struct Params {
-  int rows, cols;
-  bool needPerms, needShuffle, rowMajor;
-};  // struct Params
-
-template <typename T>
-struct Permute : public Fixture {
-  Permute(const std::string& name, const Params& p) : Fixture(name), params(p) {}
-
- protected:
-  void allocateBuffers(const ::benchmark::State& state) override
-  {
-    auto matLen = params.rows * params.cols;
-    auto vecLen = params.rows;
-    if (params.needPerms) {
-      alloc(perms, vecLen);
-    } else {
-      perms = nullptr;
-    }
-    raft::random::Rng r(123456ULL);
-    if (params.needShuffle) {
-      alloc(out, matLen);
-      alloc(in, matLen);
-      r.uniform(in, vecLen, T(-1.0), T(1.0), stream);
-    } else {
-      out = in = nullptr;
-    }
-  }
-
-  void deallocateBuffers(const ::benchmark::State& state) override
-  {
-    auto matLen = params.rows * params.cols;
-    auto vecLen = params.rows;
-    if (params.needShuffle) {
-      dealloc(out, matLen);
-      dealloc(in, matLen);
-    }
-    if (params.needPerms) { dealloc(perms, vecLen); }
-  }
-
-  void runBenchmark(::benchmark::State& state) override
-  {
-    raft::random::Rng r(123456ULL);
-    loopOnState(state, [this, &r]() {
-      raft::random::permute(perms, out, in, params.cols, params.rows, params.rowMajor, stream);
-    });
-  }
-
- private:
-  Params params;
-  T *out, *in;
-  int* perms;
-};  // struct Permute
-
-static std::vector<Params> getInputs()
-{
-  return {
-    {32 * 1024, 128, true, true, true},
-    {1024 * 1024, 128, true, true, true},
-    {32 * 1024, 128 + 2, true, true, true},
-    {1024 * 1024, 128 + 2, true, true, true},
-    {32 * 1024, 128 + 1, true, true, true},
-    {1024 * 1024, 128 + 1, true, true, true},
-
-    {32 * 1024, 128, true, true, false},
-    {1024 * 1024, 128, true, true, false},
-    {32 * 1024, 128 + 2, true, true, false},
-    {1024 * 1024, 128 + 2, true, true, false},
-    {32 * 1024, 128 + 1, true, true, false},
-    {1024 * 1024, 128 + 1, true, true, false},
-  };
-}
-
-ML_BENCH_REGISTER(Params, Permute<float>, "", getInputs());
-ML_BENCH_REGISTER(Params, Permute<double>, "", getInputs());
-
-}  // namespace Random
-}  // namespace Bench
-}  // namespace MLCommon
diff --git a/cpp/bench/prims/reduce.cu b/cpp/bench/prims/reduce.cu
deleted file mode 100644
index bdfe17c62d..0000000000
--- a/cpp/bench/prims/reduce.cu
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/ml_benchmark.hpp>
-#include <raft/linalg/reduce.hpp>
-
-namespace MLCommon {
-namespace Bench {
-namespace LinAlg {
-
-struct Params {
-  int rows, cols;
-  bool alongRows;
-};  // struct Params
-
-template <typename T>
-struct Reduce : public Fixture {
-  Reduce(const std::string& name, const Params& p) : Fixture(name), params(p) {}
-
- protected:
-  void allocateBuffers(const ::benchmark::State& state) override
-  {
-    alloc(data, params.rows * params.cols, true);
-    alloc(dots, params.rows, true);
-  }
-
-  void deallocateBuffers(const ::benchmark::State& state) override
-  {
-    dealloc(data, params.rows * params.cols);
-    dealloc(dots, params.rows);
-  }
-
-  void runBenchmark(::benchmark::State& state) override
-  {
-    loopOnState(state, [this]() {
-      raft::linalg::reduce(
-        dots, data, params.cols, params.rows, T(0.f), true, params.alongRows, stream);
-    });
-  }
-
- private:
-  Params params;
-  T *data, *dots;
-};  // struct Reduce
-
-static std::vector<Params> getInputs()
-{
-  return {
-    {8 * 1024, 1024, false},
-    {1024, 8 * 1024, false},
-    {8 * 1024, 8 * 1024, false},
-    {32 * 1024, 1024, false},
-    {1024, 32 * 1024, false},
-    {32 * 1024, 32 * 1024, false},
-
-    {8 * 1024, 1024, true},
-    {1024, 8 * 1024, true},
-    {8 * 1024, 8 * 1024, true},
-    {32 * 1024, 1024, true},
-    {1024, 32 * 1024, true},
-    {32 * 1024, 32 * 1024, true},
-  };
-}
-
-ML_BENCH_REGISTER(Params, Reduce<float>, "", getInputs());
-ML_BENCH_REGISTER(Params, Reduce<double>, "", getInputs());
-
-}  // namespace LinAlg
-}  // namespace Bench
-}  // namespace MLCommon
diff --git a/cpp/bench/prims/rng.cu b/cpp/bench/prims/rng.cu
deleted file mode 100644
index aca727eb1b..0000000000
--- a/cpp/bench/prims/rng.cu
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/ml_benchmark.hpp>
-#include <raft/cudart_utils.h>
-#include <raft/random/rng.hpp>
-
-namespace MLCommon {
-namespace Bench {
-namespace Random {
-
-enum RandomType {
-  RNG_Normal,
-  RNG_LogNormal,
-  RNG_Uniform,
-  RNG_Gumbel,
-  RNG_Logistic,
-  RNG_Exp,
-  RNG_Rayleigh,
-  RNG_Laplace,
-  RNG_Fill
-};  // enum RandomType
-
-template <typename T>
-struct Params {
-  int len;
-  RandomType type;
-  raft::random::GeneratorType gtype;
-  T start, end;
-};  // struct Params
-
-template <typename T>
-struct RngBench : public Fixture {
-  RngBench(const std::string& name, const Params<T>& p) : Fixture(name), params(p) {}
-
- protected:
-  void allocateBuffers(const ::benchmark::State& state) override { alloc(ptr, params.len); }
-
-  void deallocateBuffers(const ::benchmark::State& state) override { dealloc(ptr, params.len); }
-
-  void runBenchmark(::benchmark::State& state) override
-  {
-    raft::random::Rng r(123456ULL, params.gtype);
-    loopOnState(state, [this, &r]() {
-      switch (params.type) {
-        case RNG_Normal: r.normal(ptr, params.len, params.start, params.end, stream); break;
-        case RNG_LogNormal: r.lognormal(ptr, params.len, params.start, params.end, stream); break;
-        case RNG_Uniform: r.uniform(ptr, params.len, params.start, params.end, stream); break;
-        case RNG_Gumbel: r.gumbel(ptr, params.len, params.start, params.end, stream); break;
-        case RNG_Logistic: r.logistic(ptr, params.len, params.start, params.end, stream); break;
-        case RNG_Exp: r.exponential(ptr, params.len, params.start, stream); break;
-        case RNG_Rayleigh: r.rayleigh(ptr, params.len, params.start, stream); break;
-        case RNG_Laplace: r.laplace(ptr, params.len, params.start, params.end, stream); break;
-        case RNG_Fill: r.fill(ptr, params.len, params.start, stream); break;
-      };
-    });
-  }
-
- private:
-  Params<T> params;
-  T* ptr;
-};  // struct RngBench
-
-template <typename T>
-static std::vector<Params<T>> getInputs()
-{
-  using namespace raft::random;
-  return {
-    {1024 * 1024, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
-    {32 * 1024 * 1024, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
-    {1024 * 1024 * 1024, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
-    {1024 * 1024 + 2, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
-    {32 * 1024 * 1024 + 2, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
-    {1024 * 1024 * 1024 + 2, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
-    {1024 * 1024 + 1, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
-    {32 * 1024 * 1024 + 1, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
-    {1024 * 1024 * 1024 + 1, RNG_Uniform, GenPhilox, T(-1.0), T(1.0)},
-
-    {1024 * 1024, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
-    {32 * 1024 * 1024, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
-    {1024 * 1024 * 1024, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
-    {1024 * 1024 + 2, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
-    {32 * 1024 * 1024 + 2, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
-    {1024 * 1024 * 1024 + 2, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
-    {1024 * 1024 + 1, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
-    {32 * 1024 * 1024 + 1, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
-    {1024 * 1024 * 1024 + 1, RNG_Uniform, GenPC, T(-1.0), T(1.0)},
-
-    {1024 * 1024, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
-    {32 * 1024 * 1024, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
-    {1024 * 1024 * 1024, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
-    {1024 * 1024 + 2, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
-    {32 * 1024 * 1024 + 2, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
-    {1024 * 1024 * 1024 + 2, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
-    {1024 * 1024 + 1, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
-    {32 * 1024 * 1024 + 1, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
-    {1024 * 1024 * 1024 + 1, RNG_Fill, GenPhilox, T(-1.0), T(1.0)},
-  };
-}
-
-ML_BENCH_REGISTER(Params<float>, RngBench<float>, "", getInputs<float>());
-ML_BENCH_REGISTER(Params<double>, RngBench<double>, "", getInputs<double>());
-
-}  // namespace Random
-}  // namespace Bench
-}  // namespace MLCommon

From bfeeafba29510bd1871b5fcae79a92a721d72b75 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 6 Apr 2022 19:16:02 -0400
Subject: [PATCH 19/38] Fixing completeness score

---
 cpp/src/metrics/completeness_score.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/metrics/completeness_score.cu b/cpp/src/metrics/completeness_score.cu
index c10caa4892..167dc41cf8 100644
--- a/cpp/src/metrics/completeness_score.cu
+++ b/cpp/src/metrics/completeness_score.cu
@@ -30,7 +30,7 @@ double completeness_score(const raft::handle_t& handle,
                           const int upper_class_range)
 {
   return raft::stats::completeness_score(
-    y_hat, y, n, lower_class_range, upper_class_range, handle.get_stream());
+    y, y_hat, n, lower_class_range, upper_class_range, handle.get_stream());
 }
 
 }  // namespace Metrics

From 9985e69da8a394892c031267b3c820e3def3fdd1 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 6 Apr 2022 19:29:56 -0400
Subject: [PATCH 20/38] Fixing completeness score for now

---
 cpp/src/metrics/completeness_score.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/metrics/completeness_score.cu b/cpp/src/metrics/completeness_score.cu
index 167dc41cf8..bcd2a8900e 100644
--- a/cpp/src/metrics/completeness_score.cu
+++ b/cpp/src/metrics/completeness_score.cu
@@ -16,7 +16,7 @@
  */
 
 #include <cuml/metrics/metrics.hpp>
-#include <raft/stats/completeness_score.hpp>
+#include <raft/stats/homogeneity_score.cuh>
 
 namespace ML {
 
@@ -29,8 +29,8 @@ double completeness_score(const raft::handle_t& handle,
                           const int lower_class_range,
                           const int upper_class_range)
 {
-  return raft::stats::completeness_score(
-    y, y_hat, n, lower_class_range, upper_class_range, handle.get_stream());
+  return raft::stats::homogeneity_score(
+    y_hat, y, n, lower_class_range, upper_class_range, handle.get_stream());
 }
 
 }  // namespace Metrics

From 8663dc984bbfca92412e70277ceb316758d8b08e Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Thu, 7 Apr 2022 19:50:46 -0400
Subject: [PATCH 21/38] Fixing get_raft.cmake

---
 cpp/cmake/thirdparty/get_raft.cmake | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index 44c1d2c2f2..d63087a7fb 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -22,7 +22,10 @@ function(find_and_configure_raft)
     cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
             "${multiValueArgs}" ${ARGN} )
 
-    set(CPM_DOWNLOAD_raft ON)
+    if(PKG_CLONE_ON_PIN AND NOT PKG_PINNED_TAG STREQUAL "branch-${CUML_BRANCH_VERSION_raft}")
+        message(STATUS "CUML: RAFT pinned tag found: ${PKG_PINNED_TAG}. Cloning raft locally.")
+        set(CPM_DOWNLOAD_raft ON)
+    endif()
 
     if(PKG_USE_RAFT_STATIC)
         message(STATUS "CUML: Cloning raft locally to build static libraries.")

From 352e6fb8f32e77a9780f0e221cca7ff20a57c043 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 3 Oct 2022 12:51:52 -0400
Subject: [PATCH 22/38] Removing more files

---
 .../metrics/batched/information_criterion.cuh |  85 ------
 .../metrics/batched/silhouette_score.cuh      | 280 ------------------
 cpp/src_prims/metrics/homogeneity_score.cuh   |  69 -----
 3 files changed, 434 deletions(-)
 delete mode 100644 cpp/src_prims/metrics/batched/information_criterion.cuh
 delete mode 100644 cpp/src_prims/metrics/batched/silhouette_score.cuh
 delete mode 100644 cpp/src_prims/metrics/homogeneity_score.cuh

diff --git a/cpp/src_prims/metrics/batched/information_criterion.cuh b/cpp/src_prims/metrics/batched/information_criterion.cuh
deleted file mode 100644
index d1907538eb..0000000000
--- a/cpp/src_prims/metrics/batched/information_criterion.cuh
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file information_criterion.cuh
- * @brief These information criteria are used to evaluate the quality of models
- *        by balancing the quality of the fit and the number of parameters.
- *
- * See:
- *  - AIC: https://en.wikipedia.org/wiki/Akaike_information_criterion
- *  - AICc: https://en.wikipedia.org/wiki/Akaike_information_criterion#AICc
- *  - BIC: https://en.wikipedia.org/wiki/Bayesian_information_criterion
- */
-
-#include <raft/linalg/unary_op.cuh>
-
-#include <cmath>
-
-namespace MLCommon {
-namespace Metrics {
-
-/// Supported types of information criteria
-enum IC_Type { AIC, AICc, BIC };
-
-namespace Batched {
-
-/**
- * Compute the given type of information criterion
- *
- * @note: it is safe to do the computation in-place (i.e give same pointer
- *        as input and output)
- *
- * @param[out] d_ic             Information criterion to be returned for each
- *                              series (device)
- * @param[in]  d_loglikelihood  Log-likelihood for each series (device)
- * @param[in]  ic_type          Type of criterion to compute. See IC_Type
- * @param[in]  n_params         Number of parameters in the model
- * @param[in]  batch_size       Number of series in the batch
- * @param[in]  n_samples        Number of samples in each series
- * @param[in]  stream           CUDA stream
- */
-template <typename ScalarT, typename IdxT>
-void information_criterion(ScalarT* d_ic,
-                           const ScalarT* d_loglikelihood,
-                           IC_Type ic_type,
-                           IdxT n_params,
-                           IdxT batch_size,
-                           IdxT n_samples,
-                           cudaStream_t stream)
-{
-  ScalarT ic_base{};
-  ScalarT N = static_cast<ScalarT>(n_params);
-  ScalarT T = static_cast<ScalarT>(n_samples);
-  switch (ic_type) {
-    case AIC: ic_base = (ScalarT)2.0 * N; break;
-    case AICc:
-      ic_base = (ScalarT)2.0 * (N + (N * (N + (ScalarT)1.0)) / (T - N - (ScalarT)1.0));
-      break;
-    case BIC: ic_base = std::log(T) * N; break;
-  }
-  /* Compute information criterion from log-likelihood and base term */
-  raft::linalg::unaryOp(
-    d_ic,
-    d_loglikelihood,
-    batch_size,
-    [=] __device__(ScalarT loglike) { return ic_base - (ScalarT)2.0 * loglike; },
-    stream);
-}
-
-}  // namespace Batched
-}  // namespace Metrics
-}  // namespace MLCommon
diff --git a/cpp/src_prims/metrics/batched/silhouette_score.cuh b/cpp/src_prims/metrics/batched/silhouette_score.cuh
deleted file mode 100644
index 79898a2f74..0000000000
--- a/cpp/src_prims/metrics/batched/silhouette_score.cuh
+++ /dev/null
@@ -1,280 +0,0 @@
-/*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "../silhouette_score.cuh"
-#include <cuml/metrics/metrics.hpp>
-
-#include <raft/cuda_utils.cuh>
-#include <raft/device_atomics.cuh>
-#include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-#include <thrust/device_vector.h>
-#include <thrust/fill.h>
-#include <thrust/reduce.h>
-
-namespace MLCommon {
-namespace Metrics {
-namespace Batched {
-
-namespace detail {
-
-/**
- * This kernel initializes matrix b (n_rows * n_labels)
- * For each label that the corresponding row is not a part of is initialized as 0
- * If the corresponding row is the only sample in its label, again 0
- * Only if the there are > 1 samples in the label, row is initialized to max
- */
-template <typename value_t, typename value_idx, typename label_idx>
-__global__ void fill_b_kernel(value_t* b,
-                              const label_idx* y,
-                              value_idx n_rows,
-                              label_idx n_labels,
-                              const value_idx* cluster_counts)
-{
-  value_idx idx = threadIdx.x + blockIdx.x * blockDim.x;
-  label_idx idy = threadIdx.y + blockIdx.y * blockDim.y;
-
-  if (idx >= n_rows || idy >= n_labels) { return; }
-
-  auto row_cluster = y[idx];
-
-  auto col_cluster_count = cluster_counts[idy];
-
-  // b for own cluster should be max value
-  // so that it does not interfere with min operator
-  // b is also max if col cluster count is 0
-  // however, b is 0 if self cluster count is 1
-  if (row_cluster == idy || col_cluster_count == 0) {
-    if (cluster_counts[row_cluster] == 1) {
-      b[idx * n_labels + idy] = 0;
-    } else {
-      b[idx * n_labels + idy] = std::numeric_limits<value_t>::max();
-    }
-  } else {
-    b[idx * n_labels + idy] = 0;
-  }
-}
-
-/**
- * This kernel does an elementwise sweep of chunked pairwise distance matrix
- * By knowing the offsets of the chunked pairwise distance matrix in the
- * global pairwise distance matrix, we are able to calculate
- * intermediate values of a and b for the rows and columns present in the
- * current chunked pairwise distance matrix.
- */
-template <typename value_t, typename value_idx, typename label_idx>
-__global__ void compute_chunked_a_b_kernel(value_t* a,
-                                           value_t* b,
-                                           value_idx row_offset,
-                                           value_idx col_offset,
-                                           const label_idx* y,
-                                           label_idx n_labels,
-                                           const value_idx* cluster_counts,
-                                           const value_t* distances,
-                                           value_idx dist_rows,
-                                           value_idx dist_cols)
-{
-  value_idx row_id = threadIdx.x + blockIdx.x * blockDim.x;
-  value_idx col_id = threadIdx.y + blockIdx.y * blockDim.y;
-
-  // these are global offsets of current element
-  // in the full pairwise distance matrix
-  value_idx pw_row_id = row_id + row_offset;
-  value_idx pw_col_id = col_id + col_offset;
-
-  if (row_id >= dist_rows || col_id >= dist_cols || pw_row_id == pw_col_id) { return; }
-
-  auto row_cluster = y[pw_row_id];
-  if (cluster_counts[row_cluster] == 1) { return; }
-
-  auto col_cluster        = y[pw_col_id];
-  auto col_cluster_counts = cluster_counts[col_cluster];
-
-  if (col_cluster == row_cluster) {
-    atomicAdd(&a[pw_row_id], distances[row_id * dist_cols + col_id] / (col_cluster_counts - 1));
-  } else {
-    atomicAdd(&b[pw_row_id * n_labels + col_cluster],
-              distances[row_id * dist_cols + col_id] / col_cluster_counts);
-  }
-}
-
-}  // namespace detail
-
-template <typename value_idx, typename label_idx>
-rmm::device_uvector<value_idx> get_cluster_counts(const raft::handle_t& handle,
-                                                  label_idx* y,
-                                                  value_idx& n_rows,
-                                                  label_idx& n_labels)
-{
-  auto stream = handle.get_stream();
-
-  rmm::device_uvector<value_idx> cluster_counts(n_labels, stream);
-
-  rmm::device_uvector<char> workspace(1, stream);
-
-  MLCommon::Metrics::countLabels(y, cluster_counts.data(), n_rows, n_labels, workspace, stream);
-
-  return cluster_counts;
-}
-
-template <typename value_t, typename value_idx>
-rmm::device_uvector<value_t> get_pairwise_distance(const raft::handle_t& handle,
-                                                   value_t* left_begin,
-                                                   value_t* right_begin,
-                                                   value_idx& n_left_rows,
-                                                   value_idx& n_right_rows,
-                                                   value_idx& n_cols,
-                                                   raft::distance::DistanceType metric,
-                                                   cudaStream_t stream)
-{
-  rmm::device_uvector<value_t> distances(n_left_rows * n_right_rows, stream);
-
-  ML::Metrics::pairwise_distance(
-    handle, left_begin, right_begin, distances.data(), n_left_rows, n_right_rows, n_cols, metric);
-
-  return distances;
-}
-
-template <typename value_t, typename value_idx, typename label_idx>
-void compute_chunked_a_b(const raft::handle_t& handle,
-                         value_t* a,
-                         value_t* b,
-                         value_idx& row_offset,
-                         value_idx& col_offset,
-                         const label_idx* y,
-                         label_idx& n_labels,
-                         const value_idx* cluster_counts,
-                         const value_t* distances,
-                         value_idx& dist_rows,
-                         value_idx& dist_cols,
-                         cudaStream_t stream)
-{
-  dim3 block_size(std::min(dist_rows, 32), std::min(dist_cols, 32));
-  dim3 grid_size(raft::ceildiv(dist_rows, (value_idx)block_size.x),
-                 raft::ceildiv(dist_cols, (value_idx)block_size.y));
-
-  detail::compute_chunked_a_b_kernel<<<grid_size, block_size, 0, stream>>>(
-    a, b, row_offset, col_offset, y, n_labels, cluster_counts, distances, dist_rows, dist_cols);
-}
-
-template <typename value_t, typename value_idx, typename label_idx>
-value_t silhouette_score(
-  const raft::handle_t& handle,
-  value_t* X,
-  value_idx n_rows,
-  value_idx n_cols,
-  label_idx* y,
-  label_idx n_labels,
-  value_t* scores,
-  value_idx chunk,
-  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded)
-{
-  ASSERT(n_labels >= 2 && n_labels <= (n_rows - 1),
-         "silhouette Score not defined for the given number of labels!");
-
-  rmm::device_uvector<value_idx> cluster_counts = get_cluster_counts(handle, y, n_rows, n_labels);
-
-  auto stream = handle.get_stream();
-  auto policy = handle.get_thrust_policy();
-
-  auto b_size = n_rows * n_labels;
-
-  value_t *a_ptr, *b_ptr;
-  rmm::device_uvector<value_t> a(0, stream);
-  rmm::device_uvector<value_t> b(b_size, stream);
-
-  b_ptr = b.data();
-
-  // since a and silhouette score per sample are same size, reusing
-  if (scores == nullptr || scores == NULL) {
-    a.resize(n_rows, stream);
-    a_ptr = a.data();
-  } else {
-    a_ptr = scores;
-  }
-
-  thrust::fill(policy, a_ptr, a_ptr + n_rows, 0);
-
-  dim3 block_size(std::min(n_rows, 32), std::min(n_labels, 32));
-  dim3 grid_size(raft::ceildiv(n_rows, (value_idx)block_size.x),
-                 raft::ceildiv(n_labels, (label_idx)block_size.y));
-  detail::fill_b_kernel<<<grid_size, block_size, 0, stream>>>(
-    b_ptr, y, n_rows, n_labels, cluster_counts.data());
-
-  handle.wait_stream_pool_on_stream();
-
-  auto n_iters = 0;
-
-  for (value_idx i = 0; i < n_rows; i += chunk) {
-    for (value_idx j = 0; j < n_rows; j += chunk) {
-      ++n_iters;
-
-      auto chunk_stream = handle.get_next_usable_stream(i + chunk * j);
-
-      auto* left_begin  = X + (i * n_cols);
-      auto* right_begin = X + (j * n_cols);
-
-      auto n_left_rows  = (i + chunk) < n_rows ? chunk : (n_rows - i);
-      auto n_right_rows = (j + chunk) < n_rows ? chunk : (n_rows - j);
-
-      rmm::device_uvector<value_t> distances = get_pairwise_distance(
-        handle, left_begin, right_begin, n_left_rows, n_right_rows, n_cols, metric, chunk_stream);
-
-      compute_chunked_a_b(handle,
-                          a_ptr,
-                          b_ptr,
-                          i,
-                          j,
-                          y,
-                          n_labels,
-                          cluster_counts.data(),
-                          distances.data(),
-                          n_left_rows,
-                          n_right_rows,
-                          chunk_stream);
-    }
-  }
-
-  handle.sync_stream_pool();
-
-  // calculating row-wise minimum in b
-  // this prim only supports int indices for now
-  raft::linalg::
-    reduce<value_t, value_t, value_idx, raft::Nop<value_t>, MLCommon::Metrics::MinOp<value_t>>(
-      b_ptr,
-      b_ptr,
-      n_labels,
-      n_rows,
-      std::numeric_limits<value_t>::max(),
-      true,
-      true,
-      stream,
-      false,
-      raft::Nop<value_t>(),
-      MLCommon::Metrics::MinOp<value_t>());
-
-  // calculating the silhouette score per sample
-  raft::linalg::binaryOp<value_t, MLCommon::Metrics::SilOp<value_t>, value_t, value_idx>(
-    a_ptr, a_ptr, b_ptr, n_rows, MLCommon::Metrics::SilOp<value_t>(), stream);
-
-  return thrust::reduce(policy, a_ptr, a_ptr + n_rows, value_t(0)) / n_rows;
-}
-
-}  // namespace Batched
-}  // namespace Metrics
-}  // namespace MLCommon
diff --git a/cpp/src_prims/metrics/homogeneity_score.cuh b/cpp/src_prims/metrics/homogeneity_score.cuh
deleted file mode 100644
index fdd7c1cd53..0000000000
--- a/cpp/src_prims/metrics/homogeneity_score.cuh
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-/**
- * @file homogeneity_score.cuh
- *
- * @brief A clustering result satisfies homogeneity if all of its clusters
- * contain only data points which are members of a single class.
- */
-
-#include <entropy.cuh>
-#include <mutual_info_score.cuh>
-
-namespace MLCommon {
-
-namespace Metrics {
-
-/**
- * @brief Function to calculate the homogeneity score between two clusters
- * <a href="https://en.wikipedia.org/wiki/Homogeneity_(statistics)">more info on mutual
- * information</a>
- * @param truthClusterArray: the array of truth classes of type T
- * @param predClusterArray: the array of predicted classes of type T
- * @param size: the size of the data points of type int
- * @param lowerLabelRange: the lower bound of the range of labels
- * @param upperLabelRange: the upper bound of the range of labels
- * @param stream: the cudaStream object
- */
-template <typename T>
-double homogeneity_score(const T* truthClusterArray,
-                         const T* predClusterArray,
-                         int size,
-                         T lowerLabelRange,
-                         T upperLabelRange,
-                         cudaStream_t stream)
-{
-  if (size == 0) return 1.0;
-
-  double computedMI, computedEntropy;
-
-  computedMI = MLCommon::Metrics::mutual_info_score(
-    truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-  computedEntropy =
-    MLCommon::Metrics::entropy(truthClusterArray, size, lowerLabelRange, upperLabelRange, stream);
-
-  double homogeneity;
-
-  if (computedEntropy) {
-    homogeneity = computedMI / computedEntropy;
-  } else
-    homogeneity = 1.0;
-
-  return homogeneity;
-}
-
-};  // end namespace Metrics
-};  // end namespace MLCommon

From 873e27765573bb3a3bf38cd7fa4f5cf1dec7791e Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 3 Oct 2022 12:52:39 -0400
Subject: [PATCH 23/38] Fixing style

---
 cpp/src/arima/batched_arima.cu          | 1768 ++++++++++++-----------
 cpp/src/hdbscan/detail/reachability.cuh |  398 ++---
 cpp/src/knn/knn_opg_common.cuh          | 1719 +++++++++++-----------
 cpp/src/knn/knn_sparse.cu               |   93 +-
 cpp/src/metrics/trustworthiness.cu      |   81 +-
 cpp/src/randomforest/randomforest.cuh   |  505 +++----
 cpp/src/tsne/tsne.cu                    |  100 +-
 cpp/src/umap/knn_graph/algo.cuh         |    5 +-
 cpp/src_prims/matrix/grammatrix.cuh     |    2 +-
 9 files changed, 2347 insertions(+), 2324 deletions(-)

diff --git a/cpp/src/arima/batched_arima.cu b/cpp/src/arima/batched_arima.cu
index 1f8cd63a7f..d1587b838a 100644
--- a/cpp/src/arima/batched_arima.cu
+++ b/cpp/src/arima/batched_arima.cu
@@ -50,531 +50,513 @@
 #include <timeSeries/arima_helpers.cuh>
 #include <timeSeries/fillna.cuh>
 
-namespace ML {
-
-void pack(raft::handle_t& handle,
-          const ARIMAParams<double>& params,
-          const ARIMAOrder& order,
-          int batch_size,
-          double* param_vec)
+namespace ML
 {
-  const auto stream = handle.get_stream();
-  params.pack(order, batch_size, param_vec, stream);
-}
-
-void unpack(raft::handle_t& handle,
-            ARIMAParams<double>& params,
+  void pack(raft::handle_t & handle,
+            const ARIMAParams<double>& params,
             const ARIMAOrder& order,
             int batch_size,
-            const double* param_vec)
-{
-  const auto stream = handle.get_stream();
-  params.unpack(order, batch_size, param_vec, stream);
-}
-
-void batched_diff(raft::handle_t& handle,
-                  double* d_y_diff,
-                  const double* d_y,
-                  int batch_size,
-                  int n_obs,
-                  const ARIMAOrder& order)
-{
-  const auto stream = handle.get_stream();
-  MLCommon::TimeSeries::prepare_data(
-    d_y_diff, d_y, batch_size, n_obs, order.d, order.D, order.s, stream);
-}
-
-template <typename T>
-struct is_missing {
-  typedef T argument_type;
-  typedef T result_type;
+            double* param_vec)
+  {
+    const auto stream = handle.get_stream();
+    params.pack(order, batch_size, param_vec, stream);
+  }
 
-  __thrust_exec_check_disable__ __device__ const T operator()(const T& x) const { return isnan(x); }
-};  // end is_missing
+  void unpack(raft::handle_t & handle,
+              ARIMAParams<double> & params,
+              const ARIMAOrder& order,
+              int batch_size,
+              const double* param_vec)
+  {
+    const auto stream = handle.get_stream();
+    params.unpack(order, batch_size, param_vec, stream);
+  }
 
-bool detect_missing(raft::handle_t& handle, const double* d_y, int n_elem)
-{
-  return thrust::any_of(
-    thrust::cuda::par.on(handle.get_stream()), d_y, d_y + n_elem, is_missing<double>());
-}
-
-void predict(raft::handle_t& handle,
-             const ARIMAMemory<double>& arima_mem,
-             const double* d_y,
-             const double* d_exog,
-             const double* d_exog_fut,
-             int batch_size,
-             int n_obs,
-             int start,
-             int end,
-             const ARIMAOrder& order,
-             const ARIMAParams<double>& params,
-             double* d_y_p,
-             bool pre_diff,
-             double level,
-             double* d_lower,
-             double* d_upper)
-{
-  raft::common::nvtx::range fun_scope(__func__);
-  const auto stream = handle.get_stream();
-
-  bool diff     = order.need_diff() && pre_diff && level == 0;
-  int num_steps = std::max(end - n_obs, 0);
-
-  // Prepare data
-  int n_obs_kf;
-  const double* d_y_kf;
-  const double* d_exog_kf;
-  const double* d_exog_fut_kf = d_exog_fut;
-  ARIMAOrder order_after_prep = order;
-  rmm::device_uvector<double> exog_fut_buffer(0, stream);
-  if (diff) {
-    n_obs_kf = n_obs - order.n_diff();
+  void batched_diff(raft::handle_t & handle,
+                    double* d_y_diff,
+                    const double* d_y,
+                    int batch_size,
+                    int n_obs,
+                    const ARIMAOrder& order)
+  {
+    const auto stream = handle.get_stream();
     MLCommon::TimeSeries::prepare_data(
-      arima_mem.y_diff, d_y, batch_size, n_obs, order.d, order.D, order.s, stream);
-    if (order.n_exog > 0) {
-      MLCommon::TimeSeries::prepare_data(arima_mem.exog_diff,
-                                         d_exog,
-                                         order.n_exog * batch_size,
-                                         n_obs,
-                                         order.d,
-                                         order.D,
-                                         order.s,
-                                         stream);
+      d_y_diff, d_y, batch_size, n_obs, order.d, order.D, order.s, stream);
+  }
 
-      if (num_steps > 0) {
-        exog_fut_buffer.resize(num_steps * order.n_exog * batch_size, stream);
-
-        MLCommon::TimeSeries::prepare_future_data(exog_fut_buffer.data(),
-                                                  d_exog,
-                                                  d_exog_fut,
-                                                  order.n_exog * batch_size,
-                                                  n_obs,
-                                                  num_steps,
-                                                  order.d,
-                                                  order.D,
-                                                  order.s,
-                                                  stream);
-
-        d_exog_fut_kf = exog_fut_buffer.data();
-      }
+  template <typename T>
+  struct is_missing {
+    typedef T argument_type;
+    typedef T result_type;
+
+    __thrust_exec_check_disable__ __device__ const T operator()(const T& x) const
+    {
+      return isnan(x);
     }
-    order_after_prep.d = 0;
-    order_after_prep.D = 0;
-
-    d_y_kf    = arima_mem.y_diff;
-    d_exog_kf = arima_mem.exog_diff;
-  } else {
-    n_obs_kf  = n_obs;
-    d_y_kf    = d_y;
-    d_exog_kf = d_exog;
-  }
+  };  // end is_missing
 
-  double* d_pred = arima_mem.pred;
-
-  // Create temporary array for the forecasts
-  rmm::device_uvector<double> fc_buffer(num_steps * batch_size, stream);
-  double* d_y_fc = fc_buffer.data();
-
-  // Compute the residual and forecast
-  std::vector<double> loglike = std::vector<double>(batch_size);
-  /// TODO: use device loglike to avoid useless copy ; part of #2233
-  batched_loglike(handle,
-                  arima_mem,
-                  d_y_kf,
-                  d_exog_kf,
-                  batch_size,
-                  n_obs_kf,
-                  order_after_prep,
-                  params,
-                  loglike.data(),
-                  false,
-                  true,
-                  MLE,
-                  0,
-                  num_steps,
-                  d_y_fc,
-                  d_exog_fut_kf,
-                  level,
-                  d_lower,
-                  d_upper);
-
-  auto counting  = thrust::make_counting_iterator(0);
-  int predict_ld = end - start;
-
-  //
-  // In-sample prediction
-  //
-
-  // The prediction loop starts by filling undefined predictions with NaN,
-  // then computes the predictions from the observations and residuals
-  if (start < n_obs) {
-    int res_offset = diff ? order.d + order.s * order.D : 0;
-    int p_start    = std::max(start, res_offset);
-    int p_end      = std::min(n_obs, end);
-    int dD         = diff ? order.d + order.D : 0;
-    int period1    = order.d ? 1 : order.s;
-    int period2    = order.d == 2 ? 1 : order.s;
+  bool detect_missing(raft::handle_t & handle, const double* d_y, int n_elem)
+  {
+    return thrust::any_of(
+      thrust::cuda::par.on(handle.get_stream()), d_y, d_y + n_elem, is_missing<double>());
+  }
 
-    thrust::for_each(
-      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-        d_y_p[0] = 0.0;
-        for (int i = 0; i < res_offset - start; i++) {
-          d_y_p[bid * predict_ld + i] = nan("");
-        }
-        for (int i = p_start; i < p_end; i++) {
-          if (dD == 0) {
-            d_y_p[bid * predict_ld + i - start] = d_pred[bid * n_obs + i];
-          } else if (dD == 1) {
-            d_y_p[bid * predict_ld + i - start] =
-              d_y[bid * n_obs + i - period1] + d_pred[bid * n_obs_kf + i - res_offset];
-          } else {
-            d_y_p[bid * predict_ld + i - start] =
-              d_y[bid * n_obs + i - period1] + d_y[bid * n_obs + i - period2] -
-              d_y[bid * n_obs + i - period1 - period2] + d_pred[bid * n_obs_kf + i - res_offset];
-          }
+  void predict(raft::handle_t & handle,
+               const ARIMAMemory<double>& arima_mem,
+               const double* d_y,
+               const double* d_exog,
+               const double* d_exog_fut,
+               int batch_size,
+               int n_obs,
+               int start,
+               int end,
+               const ARIMAOrder& order,
+               const ARIMAParams<double>& params,
+               double* d_y_p,
+               bool pre_diff,
+               double level,
+               double* d_lower,
+               double* d_upper)
+  {
+    raft::common::nvtx::range fun_scope(__func__);
+    const auto stream = handle.get_stream();
+
+    bool diff     = order.need_diff() && pre_diff && level == 0;
+    int num_steps = std::max(end - n_obs, 0);
+
+    // Prepare data
+    int n_obs_kf;
+    const double* d_y_kf;
+    const double* d_exog_kf;
+    const double* d_exog_fut_kf = d_exog_fut;
+    ARIMAOrder order_after_prep = order;
+    rmm::device_uvector<double> exog_fut_buffer(0, stream);
+    if (diff) {
+      n_obs_kf = n_obs - order.n_diff();
+      MLCommon::TimeSeries::prepare_data(
+        arima_mem.y_diff, d_y, batch_size, n_obs, order.d, order.D, order.s, stream);
+      if (order.n_exog > 0) {
+        MLCommon::TimeSeries::prepare_data(arima_mem.exog_diff,
+                                           d_exog,
+                                           order.n_exog * batch_size,
+                                           n_obs,
+                                           order.d,
+                                           order.D,
+                                           order.s,
+                                           stream);
+
+        if (num_steps > 0) {
+          exog_fut_buffer.resize(num_steps * order.n_exog * batch_size, stream);
+
+          MLCommon::TimeSeries::prepare_future_data(exog_fut_buffer.data(),
+                                                    d_exog,
+                                                    d_exog_fut,
+                                                    order.n_exog * batch_size,
+                                                    n_obs,
+                                                    num_steps,
+                                                    order.d,
+                                                    order.D,
+                                                    order.s,
+                                                    stream);
+
+          d_exog_fut_kf = exog_fut_buffer.data();
         }
-      });
-  }
+      }
+      order_after_prep.d = 0;
+      order_after_prep.D = 0;
+
+      d_y_kf    = arima_mem.y_diff;
+      d_exog_kf = arima_mem.exog_diff;
+    } else {
+      n_obs_kf  = n_obs;
+      d_y_kf    = d_y;
+      d_exog_kf = d_exog;
+    }
 
-  //
-  // Finalize out-of-sample forecast and copy in-sample predictions
-  //
+    double* d_pred = arima_mem.pred;
 
-  if (num_steps) {
-    if (diff) {
-      MLCommon::TimeSeries::finalize_forecast(
-        d_y_fc, d_y, num_steps, batch_size, n_obs, n_obs, order.d, order.D, order.s, stream);
+    // Create temporary array for the forecasts
+    rmm::device_uvector<double> fc_buffer(num_steps * batch_size, stream);
+    double* d_y_fc = fc_buffer.data();
+
+    // Compute the residual and forecast
+    std::vector<double> loglike = std::vector<double>(batch_size);
+    /// TODO: use device loglike to avoid useless copy ; part of #2233
+    batched_loglike(handle,
+                    arima_mem,
+                    d_y_kf,
+                    d_exog_kf,
+                    batch_size,
+                    n_obs_kf,
+                    order_after_prep,
+                    params,
+                    loglike.data(),
+                    false,
+                    true,
+                    MLE,
+                    0,
+                    num_steps,
+                    d_y_fc,
+                    d_exog_fut_kf,
+                    level,
+                    d_lower,
+                    d_upper);
+
+    auto counting  = thrust::make_counting_iterator(0);
+    int predict_ld = end - start;
+
+    //
+    // In-sample prediction
+    //
+
+    // The prediction loop starts by filling undefined predictions with NaN,
+    // then computes the predictions from the observations and residuals
+    if (start < n_obs) {
+      int res_offset = diff ? order.d + order.s * order.D : 0;
+      int p_start    = std::max(start, res_offset);
+      int p_end      = std::min(n_obs, end);
+      int dD         = diff ? order.d + order.D : 0;
+      int period1    = order.d ? 1 : order.s;
+      int period2    = order.d == 2 ? 1 : order.s;
+
+      thrust::for_each(
+        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+          d_y_p[0] = 0.0;
+          for (int i = 0; i < res_offset - start; i++) {
+            d_y_p[bid * predict_ld + i] = nan("");
+          }
+          for (int i = p_start; i < p_end; i++) {
+            if (dD == 0) {
+              d_y_p[bid * predict_ld + i - start] = d_pred[bid * n_obs + i];
+            } else if (dD == 1) {
+              d_y_p[bid * predict_ld + i - start] =
+                d_y[bid * n_obs + i - period1] + d_pred[bid * n_obs_kf + i - res_offset];
+            } else {
+              d_y_p[bid * predict_ld + i - start] =
+                d_y[bid * n_obs + i - period1] + d_y[bid * n_obs + i - period2] -
+                d_y[bid * n_obs + i - period1 - period2] + d_pred[bid * n_obs_kf + i - res_offset];
+            }
+          }
+        });
     }
 
-    // Copy forecast in d_y_p
-    thrust::for_each(
-      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-        for (int i = 0; i < num_steps; i++) {
-          d_y_p[bid * predict_ld + n_obs - start + i] = d_y_fc[num_steps * bid + i];
-        }
-      });
-    /// TODO: 2D copy kernel?
-  }
-}
+    //
+    // Finalize out-of-sample forecast and copy in-sample predictions
+    //
 
-/**
- * Kernel to compute the sum-of-squares log-likelihood estimation
- *
- * @param[in]  d_y        Series to fit
- * @param[in]  d_mu       mu parameters
- * @param[in]  d_ar       AR parameters
- * @param[in]  d_ma       MA parameters
- * @param[in]  d_sar      Seasonal AR parameters
- * @param[in]  d_sma      Seasonal MA parameters
- * @param[out] d_loglike  Evaluated log-likelihood
- * @param[in]  n_obs      Number of observations in a time series
- * @param[in]  n_phi      Number of phi coefficients (combined AR-SAR)
- * @param[in]  n_theta    Number of theta coefficients (combined MA-SMA)
- * @param[in]  p          Number of AR parameters
- * @param[in]  q          Number of MA parameters
- * @param[in]  P          Number of seasonal AR parameters
- * @param[in]  Q          Number of seasonal MA parameters
- * @param[in]  s          Seasonal period or 0
- * @param[in]  k          Whether to use an intercept
- * @param[in]  start_sum  At which index to start the sum
- * @param[in]  start_y    First used y index (observation)
- * @param[in]  start_v    First used v index (residual)
- */
-template <typename DataT>
-__global__ void sum_of_squares_kernel(const DataT* d_y,
-                                      const DataT* d_mu,
-                                      const DataT* d_ar,
-                                      const DataT* d_ma,
-                                      const DataT* d_sar,
-                                      const DataT* d_sma,
-                                      DataT* d_loglike,
-                                      int n_obs,
-                                      int n_phi,
-                                      int n_theta,
-                                      int p,
-                                      int q,
-                                      int P,
-                                      int Q,
-                                      int s,
-                                      int k,
-                                      int start_sum,
-                                      int start_y,
-                                      int start_v)
-{
-  // Load phi, theta and mu to registers
-  DataT phi, theta;
-  if (threadIdx.x < n_phi) {
-    phi = MLCommon::TimeSeries::reduced_polynomial<true>(
-      blockIdx.x, d_ar, p, d_sar, P, s, threadIdx.x + 1);
-  }
-  if (threadIdx.x < n_theta) {
-    theta = MLCommon::TimeSeries::reduced_polynomial<false>(
-      blockIdx.x, d_ma, q, d_sma, Q, s, threadIdx.x + 1);
-  }
-  DataT mu = k ? d_mu[blockIdx.x] : (DataT)0;
-
-  // Shared memory: load y and initialize the residuals
-  extern __shared__ DataT shared_mem[];
-  DataT* b_y  = shared_mem;
-  DataT* b_vs = shared_mem + n_obs - start_y;
-  for (int i = threadIdx.x; i < n_obs - start_y; i += blockDim.x) {
-    b_y[i] = d_y[n_obs * blockIdx.x + i + start_y];
-  }
-  for (int i = threadIdx.x; i < start_sum - start_v; i += blockDim.x) {
-    b_vs[i] = (DataT)0;
+    if (num_steps) {
+      if (diff) {
+        MLCommon::TimeSeries::finalize_forecast(
+          d_y_fc, d_y, num_steps, batch_size, n_obs, n_obs, order.d, order.D, order.s, stream);
+      }
+
+      // Copy forecast in d_y_p
+      thrust::for_each(
+        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+          for (int i = 0; i < num_steps; i++) {
+            d_y_p[bid * predict_ld + n_obs - start + i] = d_y_fc[num_steps * bid + i];
+          }
+        });
+      /// TODO: 2D copy kernel?
+    }
   }
 
-  // Main loop
-  char* temp_smem = (char*)(shared_mem + 2 * n_obs - start_y - start_v);
-  DataT res, ssq = 0;
-  for (int i = start_sum; i < n_obs; i++) {
-    __syncthreads();
-    res = (DataT)0;
-    res -= threadIdx.x < n_phi ? phi * b_y[i - threadIdx.x - 1 - start_y] : (DataT)0;
-    res -= threadIdx.x < n_theta ? theta * b_vs[i - threadIdx.x - 1 - start_v] : (DataT)0;
-    res = raft::blockReduce(res, temp_smem);
+  /**
+   * Kernel to compute the sum-of-squares log-likelihood estimation
+   *
+   * @param[in]  d_y        Series to fit
+   * @param[in]  d_mu       mu parameters
+   * @param[in]  d_ar       AR parameters
+   * @param[in]  d_ma       MA parameters
+   * @param[in]  d_sar      Seasonal AR parameters
+   * @param[in]  d_sma      Seasonal MA parameters
+   * @param[out] d_loglike  Evaluated log-likelihood
+   * @param[in]  n_obs      Number of observations in a time series
+   * @param[in]  n_phi      Number of phi coefficients (combined AR-SAR)
+   * @param[in]  n_theta    Number of theta coefficients (combined MA-SMA)
+   * @param[in]  p          Number of AR parameters
+   * @param[in]  q          Number of MA parameters
+   * @param[in]  P          Number of seasonal AR parameters
+   * @param[in]  Q          Number of seasonal MA parameters
+   * @param[in]  s          Seasonal period or 0
+   * @param[in]  k          Whether to use an intercept
+   * @param[in]  start_sum  At which index to start the sum
+   * @param[in]  start_y    First used y index (observation)
+   * @param[in]  start_v    First used v index (residual)
+   */
+  template <typename DataT>
+  __global__ void sum_of_squares_kernel(const DataT* d_y,
+                                        const DataT* d_mu,
+                                        const DataT* d_ar,
+                                        const DataT* d_ma,
+                                        const DataT* d_sar,
+                                        const DataT* d_sma,
+                                        DataT* d_loglike,
+                                        int n_obs,
+                                        int n_phi,
+                                        int n_theta,
+                                        int p,
+                                        int q,
+                                        int P,
+                                        int Q,
+                                        int s,
+                                        int k,
+                                        int start_sum,
+                                        int start_y,
+                                        int start_v)
+  {
+    // Load phi, theta and mu to registers
+    DataT phi, theta;
+    if (threadIdx.x < n_phi) {
+      phi = MLCommon::TimeSeries::reduced_polynomial<true>(
+        blockIdx.x, d_ar, p, d_sar, P, s, threadIdx.x + 1);
+    }
+    if (threadIdx.x < n_theta) {
+      theta = MLCommon::TimeSeries::reduced_polynomial<false>(
+        blockIdx.x, d_ma, q, d_sma, Q, s, threadIdx.x + 1);
+    }
+    DataT mu = k ? d_mu[blockIdx.x] : (DataT)0;
+
+    // Shared memory: load y and initialize the residuals
+    extern __shared__ DataT shared_mem[];
+    DataT* b_y  = shared_mem;
+    DataT* b_vs = shared_mem + n_obs - start_y;
+    for (int i = threadIdx.x; i < n_obs - start_y; i += blockDim.x) {
+      b_y[i] = d_y[n_obs * blockIdx.x + i + start_y];
+    }
+    for (int i = threadIdx.x; i < start_sum - start_v; i += blockDim.x) {
+      b_vs[i] = (DataT)0;
+    }
+
+    // Main loop
+    char* temp_smem = (char*)(shared_mem + 2 * n_obs - start_y - start_v);
+    DataT res, ssq = 0;
+    for (int i = start_sum; i < n_obs; i++) {
+      __syncthreads();
+      res = (DataT)0;
+      res -= threadIdx.x < n_phi ? phi * b_y[i - threadIdx.x - 1 - start_y] : (DataT)0;
+      res -= threadIdx.x < n_theta ? theta * b_vs[i - threadIdx.x - 1 - start_v] : (DataT)0;
+      res = raft::blockReduce(res, temp_smem);
+      if (threadIdx.x == 0) {
+        res += b_y[i - start_y] - mu;
+        b_vs[i - start_v] = res;
+        ssq += res * res;
+      }
+    }
+
+    // Compute log-likelihood and write it to global memory
     if (threadIdx.x == 0) {
-      res += b_y[i - start_y] - mu;
-      b_vs[i - start_v] = res;
-      ssq += res * res;
+      d_loglike[blockIdx.x] =
+        -0.5 * static_cast<DataT>(n_obs) * raft::myLog(ssq / static_cast<DataT>(n_obs - start_sum));
     }
   }
 
-  // Compute log-likelihood and write it to global memory
-  if (threadIdx.x == 0) {
-    d_loglike[blockIdx.x] =
-      -0.5 * static_cast<DataT>(n_obs) * raft::myLog(ssq / static_cast<DataT>(n_obs - start_sum));
+  /**
+   * Sum-of-squares estimation method
+   *
+   * @param[in]  handle     cuML handle
+   * @param[in]  d_y        Series to fit: shape = (n_obs, batch_size)
+   * @param[in]  batch_size Number of time series
+   * @param[in]  n_obs      Number of observations in a time series
+   * @param[in]  order      ARIMA hyper-parameters
+   * @param[in]  Tparams    Transformed parameters
+   * @param[out] d_loglike  Evaluated log-likelihood (device)
+   * @param[in]  truncate   Number of observations to skip in the sum
+   */
+  void conditional_sum_of_squares(raft::handle_t & handle,
+                                  const double* d_y,
+                                  int batch_size,
+                                  int n_obs,
+                                  const ARIMAOrder& order,
+                                  const ARIMAParams<double>& Tparams,
+                                  double* d_loglike,
+                                  int truncate)
+  {
+    raft::common::nvtx::range fun_scope(__func__);
+    auto stream = handle.get_stream();
+
+    int n_phi     = order.n_phi();
+    int n_theta   = order.n_theta();
+    int max_lags  = std::max(n_phi, n_theta);
+    int start_sum = std::max(max_lags, truncate);
+    int start_y   = start_sum - n_phi;
+    int start_v   = start_sum - n_theta;
+
+    // Compute the sum-of-squares and the log-likelihood
+    int n_warps            = std::max(raft::ceildiv<int>(max_lags, 32), 1);
+    size_t shared_mem_size = (2 * n_obs - start_y - start_v + n_warps) * sizeof(double);
+    sum_of_squares_kernel<<<batch_size, 32 * n_warps, shared_mem_size, stream>>>(d_y,
+                                                                                 Tparams.mu,
+                                                                                 Tparams.ar,
+                                                                                 Tparams.ma,
+                                                                                 Tparams.sar,
+                                                                                 Tparams.sma,
+                                                                                 d_loglike,
+                                                                                 n_obs,
+                                                                                 n_phi,
+                                                                                 n_theta,
+                                                                                 order.p,
+                                                                                 order.q,
+                                                                                 order.P,
+                                                                                 order.Q,
+                                                                                 order.s,
+                                                                                 order.k,
+                                                                                 start_sum,
+                                                                                 start_y,
+                                                                                 start_v);
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
   }
-}
 
-/**
- * Sum-of-squares estimation method
- *
- * @param[in]  handle     cuML handle
- * @param[in]  d_y        Series to fit: shape = (n_obs, batch_size)
- * @param[in]  batch_size Number of time series
- * @param[in]  n_obs      Number of observations in a time series
- * @param[in]  order      ARIMA hyper-parameters
- * @param[in]  Tparams    Transformed parameters
- * @param[out] d_loglike  Evaluated log-likelihood (device)
- * @param[in]  truncate   Number of observations to skip in the sum
- */
-void conditional_sum_of_squares(raft::handle_t& handle,
-                                const double* d_y,
-                                int batch_size,
-                                int n_obs,
-                                const ARIMAOrder& order,
-                                const ARIMAParams<double>& Tparams,
-                                double* d_loglike,
-                                int truncate)
-{
-  raft::common::nvtx::range fun_scope(__func__);
-  auto stream = handle.get_stream();
-
-  int n_phi     = order.n_phi();
-  int n_theta   = order.n_theta();
-  int max_lags  = std::max(n_phi, n_theta);
-  int start_sum = std::max(max_lags, truncate);
-  int start_y   = start_sum - n_phi;
-  int start_v   = start_sum - n_theta;
-
-  // Compute the sum-of-squares and the log-likelihood
-  int n_warps            = std::max(raft::ceildiv<int>(max_lags, 32), 1);
-  size_t shared_mem_size = (2 * n_obs - start_y - start_v + n_warps) * sizeof(double);
-  sum_of_squares_kernel<<<batch_size, 32 * n_warps, shared_mem_size, stream>>>(d_y,
-                                                                               Tparams.mu,
-                                                                               Tparams.ar,
-                                                                               Tparams.ma,
-                                                                               Tparams.sar,
-                                                                               Tparams.sma,
-                                                                               d_loglike,
-                                                                               n_obs,
-                                                                               n_phi,
-                                                                               n_theta,
-                                                                               order.p,
-                                                                               order.q,
-                                                                               order.P,
-                                                                               order.Q,
-                                                                               order.s,
-                                                                               order.k,
-                                                                               start_sum,
-                                                                               start_y,
-                                                                               start_v);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-}
-
-void batched_loglike(raft::handle_t& handle,
-                     const ARIMAMemory<double>& arima_mem,
-                     const double* d_y,
-                     const double* d_exog,
-                     int batch_size,
-                     int n_obs,
-                     const ARIMAOrder& order,
-                     const ARIMAParams<double>& params,
-                     double* loglike,
-                     bool trans,
-                     bool host_loglike,
-                     LoglikeMethod method,
-                     int truncate,
-                     int fc_steps,
-                     double* d_fc,
-                     const double* d_exog_fut,
-                     double level,
-                     double* d_lower,
-                     double* d_upper)
-{
-  raft::common::nvtx::range fun_scope(__func__);
-
-  auto stream = handle.get_stream();
-
-  double* d_pred = arima_mem.pred;
-
-  ARIMAParams<double> Tparams = {params.mu,
-                                 params.beta,
-                                 arima_mem.Tparams_ar,
-                                 arima_mem.Tparams_ma,
-                                 arima_mem.Tparams_sar,
-                                 arima_mem.Tparams_sma,
-                                 arima_mem.Tparams_sigma2};
-
-  ASSERT(method == MLE || fc_steps == 0, "Only MLE method is valid for forecasting");
-
-  /* Create log-likelihood device array if host pointer is provided */
-  double* d_loglike = host_loglike ? arima_mem.loglike : loglike;
-
-  if (trans) {
-    MLCommon::TimeSeries::batched_jones_transform(
-      order, batch_size, false, params, Tparams, stream);
-  } else {
-    // non-transformed case: just use original parameters
-    Tparams.ar     = params.ar;
-    Tparams.ma     = params.ma;
-    Tparams.sar    = params.sar;
-    Tparams.sma    = params.sma;
-    Tparams.sigma2 = params.sigma2;
-  }
+  void batched_loglike(raft::handle_t & handle,
+                       const ARIMAMemory<double>& arima_mem,
+                       const double* d_y,
+                       const double* d_exog,
+                       int batch_size,
+                       int n_obs,
+                       const ARIMAOrder& order,
+                       const ARIMAParams<double>& params,
+                       double* loglike,
+                       bool trans,
+                       bool host_loglike,
+                       LoglikeMethod method,
+                       int truncate,
+                       int fc_steps,
+                       double* d_fc,
+                       const double* d_exog_fut,
+                       double level,
+                       double* d_lower,
+                       double* d_upper)
+  {
+    raft::common::nvtx::range fun_scope(__func__);
+
+    auto stream = handle.get_stream();
+
+    double* d_pred = arima_mem.pred;
+
+    ARIMAParams<double> Tparams = {params.mu,
+                                   params.beta,
+                                   arima_mem.Tparams_ar,
+                                   arima_mem.Tparams_ma,
+                                   arima_mem.Tparams_sar,
+                                   arima_mem.Tparams_sma,
+                                   arima_mem.Tparams_sigma2};
+
+    ASSERT(method == MLE || fc_steps == 0, "Only MLE method is valid for forecasting");
+
+    /* Create log-likelihood device array if host pointer is provided */
+    double* d_loglike = host_loglike ? arima_mem.loglike : loglike;
+
+    if (trans) {
+      MLCommon::TimeSeries::batched_jones_transform(
+        order, batch_size, false, params, Tparams, stream);
+    } else {
+      // non-transformed case: just use original parameters
+      Tparams.ar     = params.ar;
+      Tparams.ma     = params.ma;
+      Tparams.sar    = params.sar;
+      Tparams.sma    = params.sma;
+      Tparams.sigma2 = params.sigma2;
+    }
+
+    if (method == CSS) {
+      conditional_sum_of_squares(
+        handle, d_y, batch_size, n_obs, order, Tparams, d_loglike, truncate);
+    } else {
+      batched_kalman_filter(handle,
+                            arima_mem,
+                            d_y,
+                            d_exog,
+                            n_obs,
+                            Tparams,
+                            order,
+                            batch_size,
+                            d_loglike,
+                            d_pred,
+                            fc_steps,
+                            d_fc,
+                            d_exog_fut,
+                            level,
+                            d_lower,
+                            d_upper);
+    }
 
-  if (method == CSS) {
-    conditional_sum_of_squares(handle, d_y, batch_size, n_obs, order, Tparams, d_loglike, truncate);
-  } else {
-    batched_kalman_filter(handle,
-                          arima_mem,
-                          d_y,
-                          d_exog,
-                          n_obs,
-                          Tparams,
-                          order,
-                          batch_size,
-                          d_loglike,
-                          d_pred,
-                          fc_steps,
-                          d_fc,
-                          d_exog_fut,
-                          level,
-                          d_lower,
-                          d_upper);
+    if (host_loglike) {
+      /* Transfer log-likelihood device -> host */
+      raft::update_host(loglike, d_loglike, batch_size, stream);
+    }
   }
 
-  if (host_loglike) {
-    /* Tranfer log-likelihood device -> host */
-    raft::update_host(loglike, d_loglike, batch_size, stream);
+  void batched_loglike(raft::handle_t & handle,
+                       const ARIMAMemory<double>& arima_mem,
+                       const double* d_y,
+                       const double* d_exog,
+                       int batch_size,
+                       int n_obs,
+                       const ARIMAOrder& order,
+                       const double* d_params,
+                       double* loglike,
+                       bool trans,
+                       bool host_loglike,
+                       LoglikeMethod method,
+                       int truncate)
+  {
+    raft::common::nvtx::range fun_scope(__func__);
+
+    // unpack parameters
+    auto stream = handle.get_stream();
+
+    ARIMAParams<double> params = {arima_mem.params_mu,
+                                  arima_mem.params_beta,
+                                  arima_mem.params_ar,
+                                  arima_mem.params_ma,
+                                  arima_mem.params_sar,
+                                  arima_mem.params_sma,
+                                  arima_mem.params_sigma2};
+
+    params.unpack(order, batch_size, d_params, stream);
+
+    batched_loglike(handle,
+                    arima_mem,
+                    d_y,
+                    d_exog,
+                    batch_size,
+                    n_obs,
+                    order,
+                    params,
+                    loglike,
+                    trans,
+                    host_loglike,
+                    method,
+                    truncate);
   }
-}
-
-void batched_loglike(raft::handle_t& handle,
-                     const ARIMAMemory<double>& arima_mem,
-                     const double* d_y,
-                     const double* d_exog,
-                     int batch_size,
-                     int n_obs,
-                     const ARIMAOrder& order,
-                     const double* d_params,
-                     double* loglike,
-                     bool trans,
-                     bool host_loglike,
-                     LoglikeMethod method,
-                     int truncate)
-{
-  raft::common::nvtx::range fun_scope(__func__);
-
-  // unpack parameters
-  auto stream = handle.get_stream();
-
-  ARIMAParams<double> params = {arima_mem.params_mu,
-                                arima_mem.params_beta,
-                                arima_mem.params_ar,
-                                arima_mem.params_ma,
-                                arima_mem.params_sar,
-                                arima_mem.params_sma,
-                                arima_mem.params_sigma2};
-
-  params.unpack(order, batch_size, d_params, stream);
-
-  batched_loglike(handle,
-                  arima_mem,
-                  d_y,
-                  d_exog,
-                  batch_size,
-                  n_obs,
-                  order,
-                  params,
-                  loglike,
-                  trans,
-                  host_loglike,
-                  method,
-                  truncate);
-}
-
-void batched_loglike_grad(raft::handle_t& handle,
-                          const ARIMAMemory<double>& arima_mem,
-                          const double* d_y,
-                          const double* d_exog,
-                          int batch_size,
-                          int n_obs,
-                          const ARIMAOrder& order,
-                          const double* d_x,
-                          double* d_grad,
-                          double h,
-                          bool trans,
-                          LoglikeMethod method,
-                          int truncate)
-{
-  raft::common::nvtx::range fun_scope(__func__);
-  auto stream   = handle.get_stream();
-  auto counting = thrust::make_counting_iterator(0);
-  int N         = order.complexity();
-
-  // Initialize the perturbed x vector
-  double* d_x_pert = arima_mem.x_pert;
-  raft::copy(d_x_pert, d_x, N * batch_size, stream);
-
-  double* d_ll_base = arima_mem.loglike_base;
-  double* d_ll_pert = arima_mem.loglike_pert;
-
-  // Evaluate the log-likelihood with the given parameter vector
-  batched_loglike(handle,
-                  arima_mem,
-                  d_y,
-                  d_exog,
-                  batch_size,
-                  n_obs,
-                  order,
-                  d_x,
-                  d_ll_base,
-                  trans,
-                  false,
-                  method,
-                  truncate);
-
-  for (int i = 0; i < N; i++) {
-    // Add the perturbation to the i-th parameter
-    thrust::for_each(
-      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-        d_x_pert[N * bid + i] = d_x[N * bid + i] + h;
-      });
 
-    // Evaluate the log-likelihood with the positive perturbation
+  void batched_loglike_grad(raft::handle_t & handle,
+                            const ARIMAMemory<double>& arima_mem,
+                            const double* d_y,
+                            const double* d_exog,
+                            int batch_size,
+                            int n_obs,
+                            const ARIMAOrder& order,
+                            const double* d_x,
+                            double* d_grad,
+                            double h,
+                            bool trans,
+                            LoglikeMethod method,
+                            int truncate)
+  {
+    raft::common::nvtx::range fun_scope(__func__);
+    auto stream   = handle.get_stream();
+    auto counting = thrust::make_counting_iterator(0);
+    int N         = order.complexity();
+
+    // Initialize the perturbed x vector
+    double* d_x_pert = arima_mem.x_pert;
+    raft::copy(d_x_pert, d_x, N * batch_size, stream);
+
+    double* d_ll_base = arima_mem.loglike_base;
+    double* d_ll_pert = arima_mem.loglike_pert;
+
+    // Evaluate the log-likelihood with the given parameter vector
     batched_loglike(handle,
                     arima_mem,
                     d_y,
@@ -582,430 +564,452 @@ void batched_loglike_grad(raft::handle_t& handle,
                     batch_size,
                     n_obs,
                     order,
-                    d_x_pert,
-                    d_ll_pert,
+                    d_x,
+                    d_ll_base,
                     trans,
                     false,
                     method,
                     truncate);
 
-    // First derivative with a first-order accuracy
-    thrust::for_each(
-      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-        d_grad[N * bid + i] = (d_ll_pert[bid] - d_ll_base[bid]) / h;
-      });
+    for (int i = 0; i < N; i++) {
+      // Add the perturbation to the i-th parameter
+      thrust::for_each(
+        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+          d_x_pert[N * bid + i] = d_x[N * bid + i] + h;
+        });
 
-    // Reset the i-th parameter
-    thrust::for_each(
-      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-        d_x_pert[N * bid + i] = d_x[N * bid + i];
-      });
-  }
-}
-
-void information_criterion(raft::handle_t& handle,
-                           const ARIMAMemory<double>& arima_mem,
-                           const double* d_y,
-                           const double* d_exog,
-                           int batch_size,
-                           int n_obs,
-                           const ARIMAOrder& order,
-                           const ARIMAParams<double>& params,
-                           double* d_ic,
-                           int ic_type)
-{
-  raft::common::nvtx::range fun_scope(__func__);
-  auto stream = handle.get_stream();
-
-  /* Compute log-likelihood in d_ic */
-  batched_loglike(
-    handle, arima_mem, d_y, d_exog, batch_size, n_obs, order, params, d_ic, false, false, MLE);
-
-  /* Compute information criterion from log-likelihood and base term */
-  raft::stats::information_criterion_batched(d_ic,
-                                             d_ic,
-                                             static_cast<raft::stats::IC_Type>(ic_type),
-                                             order.complexity(),
-                                             batch_size,
-                                             n_obs - order.n_diff(),
-                                             stream);
-}
-
-/**
- * Test that the parameters are valid for the inverse transform
- *
- * @tparam isAr        Are these (S)AR or (S)MA parameters?
- * @param[in]  params  Parameters
- * @param[in]  pq      p for AR, q for MA, P for SAR, Q for SMA
- */
-template <bool isAr>
-DI bool test_invparams(const double* params, int pq)
-{
-  double new_params[8];
-  double tmp[8];
+      // Evaluate the log-likelihood with the positive perturbation
+      batched_loglike(handle,
+                      arima_mem,
+                      d_y,
+                      d_exog,
+                      batch_size,
+                      n_obs,
+                      order,
+                      d_x_pert,
+                      d_ll_pert,
+                      trans,
+                      false,
+                      method,
+                      truncate);
+
+      // First derivative with a first-order accuracy
+      thrust::for_each(
+        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+          d_grad[N * bid + i] = (d_ll_pert[bid] - d_ll_base[bid]) / h;
+        });
 
-  constexpr double coef = isAr ? 1 : -1;
+      // Reset the i-th parameter
+      thrust::for_each(
+        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+          d_x_pert[N * bid + i] = d_x[N * bid + i];
+        });
+    }
+  }
 
-  for (int i = 0; i < pq; i++) {
-    tmp[i]        = params[i];
-    new_params[i] = tmp[i];
+  void information_criterion(raft::handle_t & handle,
+                             const ARIMAMemory<double>& arima_mem,
+                             const double* d_y,
+                             const double* d_exog,
+                             int batch_size,
+                             int n_obs,
+                             const ARIMAOrder& order,
+                             const ARIMAParams<double>& params,
+                             double* d_ic,
+                             int ic_type)
+  {
+    raft::common::nvtx::range fun_scope(__func__);
+    auto stream = handle.get_stream();
+
+    /* Compute log-likelihood in d_ic */
+    batched_loglike(
+      handle, arima_mem, d_y, d_exog, batch_size, n_obs, order, params, d_ic, false, false, MLE);
+
+    /* Compute information criterion from log-likelihood and base term */
+    raft::stats::information_criterion_batched(d_ic,
+                                               d_ic,
+                                               static_cast<raft::stats::IC_Type>(ic_type),
+                                               order.complexity(),
+                                               batch_size,
+                                               n_obs - order.n_diff(),
+                                               stream);
   }
 
-  // Perform inverse transform and stop before atanh step
-  for (int j = pq - 1; j > 0; --j) {
-    double a = new_params[j];
-    for (int k = 0; k < j; ++k) {
-      tmp[k] = (new_params[k] + coef * a * new_params[j - k - 1]) / (1 - (a * a));
+  /**
+   * Test that the parameters are valid for the inverse transform
+   *
+   * @tparam isAr        Are these (S)AR or (S)MA parameters?
+   * @param[in]  params  Parameters
+   * @param[in]  pq      p for AR, q for MA, P for SAR, Q for SMA
+   */
+  template <bool isAr>
+  DI bool test_invparams(const double* params, int pq)
+  {
+    double new_params[8];
+    double tmp[8];
+
+    constexpr double coef = isAr ? 1 : -1;
+
+    for (int i = 0; i < pq; i++) {
+      tmp[i]        = params[i];
+      new_params[i] = tmp[i];
     }
-    for (int iter = 0; iter < j; ++iter) {
-      new_params[iter] = tmp[iter];
+
+    // Perform inverse transform and stop before atanh step
+    for (int j = pq - 1; j > 0; --j) {
+      double a = new_params[j];
+      for (int k = 0; k < j; ++k) {
+        tmp[k] = (new_params[k] + coef * a * new_params[j - k - 1]) / (1 - (a * a));
+      }
+      for (int iter = 0; iter < j; ++iter) {
+        new_params[iter] = tmp[iter];
+      }
     }
-  }
 
-  // Verify that the values are between -1 and 1
-  bool result = true;
-  for (int i = 0; i < pq; i++) {
-    result = result && !(new_params[i] <= -1 || new_params[i] >= 1);
+    // Verify that the values are between -1 and 1
+    bool result = true;
+    for (int i = 0; i < pq; i++) {
+      result = result && !(new_params[i] <= -1 || new_params[i] >= 1);
+    }
+    return result;
   }
-  return result;
-}
 
-/**
- * Auxiliary function of _start_params: least square approximation of an
- * ARMA model (with or without seasonality)
- * @note: in this function the non-seasonal case has s=1, not s=0!
- */
-void _arma_least_squares(raft::handle_t& handle,
-                         double* d_ar,
-                         double* d_ma,
-                         double* d_sigma2,
-                         const MLCommon::LinAlg::Batched::Matrix<double>& bm_y,
-                         int p,
-                         int q,
-                         int s,
-                         bool estimate_sigma2,
-                         int k        = 0,
-                         double* d_mu = nullptr)
-{
-  const auto& handle_impl = handle;
-  auto stream             = handle_impl.get_stream();
-  auto cublas_handle      = handle_impl.get_cublas_handle();
-  auto counting           = thrust::make_counting_iterator(0);
-
-  int batch_size = bm_y.batches();
-  int n_obs      = bm_y.shape().first;
-
-  int ps = p * s, qs = q * s;
-  int p_ar = std::max(ps, 2 * qs);
-  int r    = std::max(p_ar + qs, ps);
-
-  if ((q && p_ar >= n_obs - p_ar) || p + q + k >= n_obs - r) {
-    // Too few observations for the estimate, fill with 0 (1 for sigma2)
-    if (k) RAFT_CUDA_TRY(cudaMemsetAsync(d_mu, 0, sizeof(double) * batch_size, stream));
-    if (p) RAFT_CUDA_TRY(cudaMemsetAsync(d_ar, 0, sizeof(double) * p * batch_size, stream));
-    if (q) RAFT_CUDA_TRY(cudaMemsetAsync(d_ma, 0, sizeof(double) * q * batch_size, stream));
-    if (estimate_sigma2) {
-      thrust::device_ptr<double> sigma2_thrust = thrust::device_pointer_cast(d_sigma2);
-      thrust::fill(thrust::cuda::par.on(stream), sigma2_thrust, sigma2_thrust + batch_size, 1.0);
+  /**
+   * Auxiliary function of _start_params: least square approximation of an
+   * ARMA model (with or without seasonality)
+   * @note: in this function the non-seasonal case has s=1, not s=0!
+   */
+  void _arma_least_squares(raft::handle_t & handle,
+                           double* d_ar,
+                           double* d_ma,
+                           double* d_sigma2,
+                           const MLCommon::LinAlg::Batched::Matrix<double>& bm_y,
+                           int p,
+                           int q,
+                           int s,
+                           bool estimate_sigma2,
+                           int k        = 0,
+                           double* d_mu = nullptr)
+  {
+    const auto& handle_impl = handle;
+    auto stream             = handle_impl.get_stream();
+    auto cublas_handle      = handle_impl.get_cublas_handle();
+    auto counting           = thrust::make_counting_iterator(0);
+
+    int batch_size = bm_y.batches();
+    int n_obs      = bm_y.shape().first;
+
+    int ps = p * s, qs = q * s;
+    int p_ar = std::max(ps, 2 * qs);
+    int r    = std::max(p_ar + qs, ps);
+
+    if ((q && p_ar >= n_obs - p_ar) || p + q + k >= n_obs - r) {
+      // Too few observations for the estimate, fill with 0 (1 for sigma2)
+      if (k) RAFT_CUDA_TRY(cudaMemsetAsync(d_mu, 0, sizeof(double) * batch_size, stream));
+      if (p) RAFT_CUDA_TRY(cudaMemsetAsync(d_ar, 0, sizeof(double) * p * batch_size, stream));
+      if (q) RAFT_CUDA_TRY(cudaMemsetAsync(d_ma, 0, sizeof(double) * q * batch_size, stream));
+      if (estimate_sigma2) {
+        thrust::device_ptr<double> sigma2_thrust = thrust::device_pointer_cast(d_sigma2);
+        thrust::fill(thrust::cuda::par.on(stream), sigma2_thrust, sigma2_thrust + batch_size, 1.0);
+      }
+      return;
     }
-    return;
-  }
 
-  /* Matrix formed by lag matrices of y and the residuals respectively,
-   * side by side. The left side will be used to estimate AR, the right
-   * side to estimate MA */
-  MLCommon::LinAlg::Batched::Matrix<double> bm_ls_ar_res(
-    n_obs - r, p + q + k, batch_size, cublas_handle, stream, false);
-  int ar_offset  = r - ps;
-  int res_offset = r - p_ar - qs;
-
-  // Get residuals from an AR(p_ar) model to estimate the MA parameters
-  if (q) {
-    // Create lagged y
-    int ls_height = n_obs - p_ar;
-    MLCommon::LinAlg::Batched::Matrix<double> bm_ls =
-      MLCommon::LinAlg::Batched::b_lagged_mat(bm_y, p_ar);
-
-    /* Matrix for the initial AR fit, initialized by copy of y
-     * (note: this is because gels works in-place ; the matrix has larger
-     *  dimensions than the actual AR fit) */
-    MLCommon::LinAlg::Batched::Matrix<double> bm_ar_fit =
-      MLCommon::LinAlg::Batched::b_2dcopy(bm_y, p_ar, 0, ls_height, 1);
-
-    // Residual, initialized as offset y to avoid one kernel call
-    MLCommon::LinAlg::Batched::Matrix<double> bm_residual(bm_ar_fit);
-
-    // Initial AR fit
-    MLCommon::LinAlg::Batched::b_gels(bm_ls, bm_ar_fit);
-
-    // Compute residual (technically a gemv)
-    MLCommon::LinAlg::Batched::b_gemm(
-      false, false, ls_height, 1, p_ar, -1.0, bm_ls, bm_ar_fit, 1.0, bm_residual);
-
-    // Lags of the residual
+    /* Matrix formed by lag matrices of y and the residuals respectively,
+     * side by side. The left side will be used to estimate AR, the right
+     * side to estimate MA */
+    MLCommon::LinAlg::Batched::Matrix<double> bm_ls_ar_res(
+      n_obs - r, p + q + k, batch_size, cublas_handle, stream, false);
+    int ar_offset  = r - ps;
+    int res_offset = r - p_ar - qs;
+
+    // Get residuals from an AR(p_ar) model to estimate the MA parameters
+    if (q) {
+      // Create lagged y
+      int ls_height = n_obs - p_ar;
+      MLCommon::LinAlg::Batched::Matrix<double> bm_ls =
+        MLCommon::LinAlg::Batched::b_lagged_mat(bm_y, p_ar);
+
+      /* Matrix for the initial AR fit, initialized by copy of y
+       * (note: this is because gels works in-place ; the matrix has larger
+       *  dimensions than the actual AR fit) */
+      MLCommon::LinAlg::Batched::Matrix<double> bm_ar_fit =
+        MLCommon::LinAlg::Batched::b_2dcopy(bm_y, p_ar, 0, ls_height, 1);
+
+      // Residual, initialized as offset y to avoid one kernel call
+      MLCommon::LinAlg::Batched::Matrix<double> bm_residual(bm_ar_fit);
+
+      // Initial AR fit
+      MLCommon::LinAlg::Batched::b_gels(bm_ls, bm_ar_fit);
+
+      // Compute residual (technically a gemv)
+      MLCommon::LinAlg::Batched::b_gemm(
+        false, false, ls_height, 1, p_ar, -1.0, bm_ls, bm_ar_fit, 1.0, bm_residual);
+
+      // Lags of the residual
+      MLCommon::LinAlg::Batched::b_lagged_mat(
+        bm_residual, bm_ls_ar_res, q, n_obs - r, res_offset, (n_obs - r) * (k + p), s);
+    }
+
+    // Fill the first column of the matrix with 1 if we fit an intercept
+    if (k) {
+      double* d_ls_ar_res = bm_ls_ar_res.raw_data();
+      thrust::for_each(
+        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+          double* b_ls_ar_res = d_ls_ar_res + bid * (n_obs - r) * (p + q + k);
+          for (int i = 0; i < n_obs - r; i++) {
+            b_ls_ar_res[i] = 1.0;
+          }
+        });
+    }
+
+    // Lags of y
     MLCommon::LinAlg::Batched::b_lagged_mat(
-      bm_residual, bm_ls_ar_res, q, n_obs - r, res_offset, (n_obs - r) * (k + p), s);
-  }
+      bm_y, bm_ls_ar_res, p, n_obs - r, ar_offset, (n_obs - r) * k, s);
+
+    /* Initializing the vector for the ARMA fit
+     * (note: also in-place as described for AR fit) */
+    MLCommon::LinAlg::Batched::Matrix<double> bm_arma_fit =
+      MLCommon::LinAlg::Batched::b_2dcopy(bm_y, r, 0, n_obs - r, 1);
+
+    // The residuals will be computed only if sigma2 is requested
+    MLCommon::LinAlg::Batched::Matrix<double> bm_final_residual(
+      n_obs - r, 1, batch_size, cublas_handle, stream, false);
+    if (estimate_sigma2) {
+      raft::copy(
+        bm_final_residual.raw_data(), bm_arma_fit.raw_data(), (n_obs - r) * batch_size, stream);
+    }
+
+    // ARMA fit
+    MLCommon::LinAlg::Batched::b_gels(bm_ls_ar_res, bm_arma_fit);
 
-  // Fill the first column of the matrix with 1 if we fit an intercept
-  if (k) {
-    double* d_ls_ar_res = bm_ls_ar_res.raw_data();
+    // Copy the results in the parameter vectors
+    const double* d_arma_fit = bm_arma_fit.raw_data();
     thrust::for_each(
       thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-        double* b_ls_ar_res = d_ls_ar_res + bid * (n_obs - r) * (p + q + k);
-        for (int i = 0; i < n_obs - r; i++) {
-          b_ls_ar_res[i] = 1.0;
+        const double* b_arma_fit = d_arma_fit + bid * (n_obs - r);
+        if (k) { d_mu[bid] = b_arma_fit[0]; }
+        if (p) {
+          double* b_ar = d_ar + bid * p;
+          for (int i = 0; i < p; i++) {
+            b_ar[i] = b_arma_fit[i + k];
+          }
+        }
+        if (q) {
+          double* b_ma = d_ma + bid * q;
+          for (int i = 0; i < q; i++) {
+            b_ma[i] = b_arma_fit[i + p + k];
+          }
         }
       });
-  }
 
-  // Lags of y
-  MLCommon::LinAlg::Batched::b_lagged_mat(
-    bm_y, bm_ls_ar_res, p, n_obs - r, ar_offset, (n_obs - r) * k, s);
-
-  /* Initializing the vector for the ARMA fit
-   * (note: also in-place as described for AR fit) */
-  MLCommon::LinAlg::Batched::Matrix<double> bm_arma_fit =
-    MLCommon::LinAlg::Batched::b_2dcopy(bm_y, r, 0, n_obs - r, 1);
-
-  // The residuals will be computed only if sigma2 is requested
-  MLCommon::LinAlg::Batched::Matrix<double> bm_final_residual(
-    n_obs - r, 1, batch_size, cublas_handle, stream, false);
-  if (estimate_sigma2) {
-    raft::copy(
-      bm_final_residual.raw_data(), bm_arma_fit.raw_data(), (n_obs - r) * batch_size, stream);
-  }
+    if (estimate_sigma2) {
+      // Compute final residual (technically a gemv)
+      MLCommon::LinAlg::Batched::b_gemm(false,
+                                        false,
+                                        n_obs - r,
+                                        1,
+                                        p + q + k,
+                                        -1.0,
+                                        bm_ls_ar_res,
+                                        bm_arma_fit,
+                                        1.0,
+                                        bm_final_residual);
+
+      // Compute variance
+      double* d_residual = bm_final_residual.raw_data();
+      thrust::for_each(
+        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+          double acc               = 0.0;
+          const double* b_residual = d_residual + (n_obs - r) * bid;
+          for (int i = q; i < n_obs - r; i++) {
+            double res = b_residual[i];
+            acc += res * res;
+          }
+          d_sigma2[bid] = acc / static_cast<double>(n_obs - r - q);
+        });
+    }
 
-  // ARMA fit
-  MLCommon::LinAlg::Batched::b_gels(bm_ls_ar_res, bm_arma_fit);
-
-  // Copy the results in the parameter vectors
-  const double* d_arma_fit = bm_arma_fit.raw_data();
-  thrust::for_each(
-    thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-      const double* b_arma_fit = d_arma_fit + bid * (n_obs - r);
-      if (k) { d_mu[bid] = b_arma_fit[0]; }
-      if (p) {
-        double* b_ar = d_ar + bid * p;
-        for (int i = 0; i < p; i++) {
-          b_ar[i] = b_arma_fit[i + k];
-        }
-      }
-      if (q) {
-        double* b_ma = d_ma + bid * q;
-        for (int i = 0; i < q; i++) {
-          b_ma[i] = b_arma_fit[i + p + k];
-        }
-      }
-    });
-
-  if (estimate_sigma2) {
-    // Compute final residual (technically a gemv)
-    MLCommon::LinAlg::Batched::b_gemm(false,
-                                      false,
-                                      n_obs - r,
-                                      1,
-                                      p + q + k,
-                                      -1.0,
-                                      bm_ls_ar_res,
-                                      bm_arma_fit,
-                                      1.0,
-                                      bm_final_residual);
-
-    // Compute variance
-    double* d_residual = bm_final_residual.raw_data();
+    // If (S)AR or (S)MA are not valid for the inverse transform, set them to zero
     thrust::for_each(
       thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-        double acc               = 0.0;
-        const double* b_residual = d_residual + (n_obs - r) * bid;
-        for (int i = q; i < n_obs - r; i++) {
-          double res = b_residual[i];
-          acc += res * res;
+        if (p) {
+          double* b_ar = d_ar + bid * p;
+          bool valid   = test_invparams<true>(b_ar, p);
+          if (!valid) {
+            for (int ip = 0; ip < p; ip++)
+              b_ar[ip] = 0;
+          }
+        }
+        if (q) {
+          double* b_ma = d_ma + bid * q;
+          bool valid   = test_invparams<false>(b_ma, q);
+          if (!valid) {
+            for (int iq = 0; iq < q; iq++)
+              b_ma[iq] = 0;
+          }
         }
-        d_sigma2[bid] = acc / static_cast<double>(n_obs - r - q);
       });
   }
 
-  // If (S)AR or (S)MA are not valid for the inverse transform, set them to zero
-  thrust::for_each(
-    thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-      if (p) {
-        double* b_ar = d_ar + bid * p;
-        bool valid   = test_invparams<true>(b_ar, p);
-        if (!valid) {
-          for (int ip = 0; ip < p; ip++)
-            b_ar[ip] = 0;
-        }
+  /**
+   * Auxiliary function of estimate_x0: compute the starting parameters for
+   * the series pre-processed by estimate_x0
+   */
+  void _start_params(raft::handle_t & handle,
+                     ARIMAParams<double> & params,
+                     MLCommon::LinAlg::Batched::Matrix<double> & bm_y,
+                     const MLCommon::LinAlg::Batched::Matrix<double>& bm_exog,
+                     const ARIMAOrder& order)
+  {
+    int batch_size      = bm_exog.batches();
+    cudaStream_t stream = bm_exog.stream();
+
+    // Estimate exog coefficients and subtract their component from endog.
+    // Exog coefficients are estimated by fitting a linear regression with X=exog, y=endog
+    if (order.n_exog > 0) {
+      // In most cases, the system will be overdetermined and we can use gels
+      if (bm_exog.shape().first > static_cast<unsigned int>(order.n_exog)) {
+        // Make a copy of the exogenous series for in-place gels
+        MLCommon::LinAlg::Batched::Matrix<double> bm_exog_copy(bm_exog);
+        // Make a copy of the endogenous series for in-place gels
+        MLCommon::LinAlg::Batched::Matrix<double> bm_y_copy(bm_y);
+
+        // Least-squares solution of overdetermined system
+        rmm::device_uvector<int> info(batch_size, stream);
+        b_gels(bm_exog_copy, bm_y_copy, info.data());
+
+        // Make a batched matrix around the exogenous coefficients
+        rmm::device_uvector<double*> beta_pointers(batch_size, stream);
+        MLCommon::LinAlg::Batched::Matrix<double> bm_exog_coef(order.n_exog,
+                                                               1,
+                                                               batch_size,
+                                                               bm_exog.cublasHandle(),
+                                                               beta_pointers.data(),
+                                                               params.beta,
+                                                               stream,
+                                                               false);
+
+        // Copy the solution of the system to the parameters array
+        b_2dcopy(bm_y_copy, bm_exog_coef, 0, 0, order.n_exog, 1);
+
+        // Set parameters to zero when solving was not successful
+        auto counting       = thrust::make_counting_iterator(0);
+        int* devInfoArray   = info.data();
+        double* d_exog_coef = bm_exog_coef.raw_data();
+        const int& n_exog   = order.n_exog;
+        thrust::for_each(
+          thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+            if (devInfoArray[bid] > 0) {
+              for (int i = 0; i < n_exog; i++) {
+                d_exog_coef[bid * n_exog + i] = 0.0;
+              }
+            }
+          });
+
+        // Compute exogenous component and store the result in bm_y_copy
+        b_gemm(false,
+               false,
+               bm_exog.shape().first,
+               1,
+               bm_exog.shape().second,
+               1.0,
+               bm_exog,
+               bm_exog_coef,
+               0.0,
+               bm_y_copy);
+
+        // Subtract exogenous component from the endogenous variable
+        b_aA_op_B(bm_y, bm_y_copy, bm_y, [] __device__(double a, double b) { return a - b; });
       }
-      if (q) {
-        double* b_ma = d_ma + bid * q;
-        bool valid   = test_invparams<false>(b_ma, q);
-        if (!valid) {
-          for (int iq = 0; iq < q; iq++)
-            b_ma[iq] = 0;
-        }
+      // In other cases, we initialize to zero
+      else {
+        RAFT_CUDA_TRY(
+          cudaMemsetAsync(params.beta, 0, order.n_exog * batch_size * sizeof(double), stream));
       }
-    });
-}
+    }
 
-/**
- * Auxiliary function of estimate_x0: compute the starting parameters for
- * the series pre-processed by estimate_x0
- */
-void _start_params(raft::handle_t& handle,
-                   ARIMAParams<double>& params,
-                   MLCommon::LinAlg::Batched::Matrix<double>& bm_y,
-                   const MLCommon::LinAlg::Batched::Matrix<double>& bm_exog,
-                   const ARIMAOrder& order)
-{
-  int batch_size      = bm_exog.batches();
-  cudaStream_t stream = bm_exog.stream();
-
-  // Estimate exog coefficients and subtract component to endog.
-  // Exog coefficients are estimated by fitting a linear regression with X=exog, y=endog
-  if (order.n_exog > 0) {
-    // In most cases, the system will be overdetermined and we can use gels
-    if (bm_exog.shape().first > static_cast<unsigned int>(order.n_exog)) {
-      // Make a copy of the exogenous series for in-place gels
-      MLCommon::LinAlg::Batched::Matrix<double> bm_exog_copy(bm_exog);
-      // Make a copy of the endogenous series for in-place gels
-      MLCommon::LinAlg::Batched::Matrix<double> bm_y_copy(bm_y);
-
-      // Least-squares solution of overdetermined system
-      rmm::device_uvector<int> info(batch_size, stream);
-      b_gels(bm_exog_copy, bm_y_copy, info.data());
-
-      // Make a batched matrix around the exogenous coefficients
-      rmm::device_uvector<double*> beta_pointers(batch_size, stream);
-      MLCommon::LinAlg::Batched::Matrix<double> bm_exog_coef(order.n_exog,
-                                                             1,
-                                                             batch_size,
-                                                             bm_exog.cublasHandle(),
-                                                             beta_pointers.data(),
-                                                             params.beta,
-                                                             stream,
-                                                             false);
-
-      // Copy the solution of the system to the parameters array
-      b_2dcopy(bm_y_copy, bm_exog_coef, 0, 0, order.n_exog, 1);
-
-      // Set parameters to zero when solving was not successful
-      auto counting       = thrust::make_counting_iterator(0);
-      int* devInfoArray   = info.data();
-      double* d_exog_coef = bm_exog_coef.raw_data();
-      const int& n_exog   = order.n_exog;
-      thrust::for_each(
-        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-          if (devInfoArray[bid] > 0) {
-            for (int i = 0; i < n_exog; i++) {
-              d_exog_coef[bid * n_exog + i] = 0.0;
-            }
-          }
-        });
+    // Estimate an ARMA fit without seasonality
+    if (order.p + order.q + order.k)
+      _arma_least_squares(handle,
+                          params.ar,
+                          params.ma,
+                          params.sigma2,
+                          bm_y,
+                          order.p,
+                          order.q,
+                          1,
+                          true,
+                          order.k,
+                          params.mu);
+
+    // Estimate a seasonal ARMA fit independently
+    if (order.P + order.Q)
+      _arma_least_squares(handle,
+                          params.sar,
+                          params.sma,
+                          params.sigma2,
+                          bm_y,
+                          order.P,
+                          order.Q,
+                          order.s,
+                          order.p + order.q + order.k == 0);
+  }
 
-      // Compute exogenous component and store the result in bm_y_copy
-      b_gemm(false,
-             false,
-             bm_exog.shape().first,
-             1,
-             bm_exog.shape().second,
-             1.0,
-             bm_exog,
-             bm_exog_coef,
-             0.0,
-             bm_y_copy);
-
-      // Subtract exogenous component to endogenous variable
-      b_aA_op_B(bm_y, bm_y_copy, bm_y, [] __device__(double a, double b) { return a - b; });
-    }
-    // In other cases, we initialize to zero
-    else {
-      RAFT_CUDA_TRY(
-        cudaMemsetAsync(params.beta, 0, order.n_exog * batch_size * sizeof(double), stream));
+  void estimate_x0(raft::handle_t & handle,
+                   ARIMAParams<double> & params,
+                   const double* d_y,
+                   const double* d_exog,
+                   int batch_size,
+                   int n_obs,
+                   const ARIMAOrder& order,
+                   bool missing)
+  {
+    raft::common::nvtx::range fun_scope(__func__);
+    const auto& handle_impl = handle;
+    auto stream             = handle_impl.get_stream();
+    auto cublas_handle      = handle_impl.get_cublas_handle();
+
+    /// TODO: solve exogenous coefficients with only valid rows instead of interpolation?
+    // Pros: better coefficients
+    // Cons: harder to test, a bit more complicated
+
+    // Least squares can't deal with missing values: create copy with naive
+    // replacements for missing values
+    const double* d_y_no_missing;
+    rmm::device_uvector<double> y_no_missing(0, stream);
+    if (missing) {
+      y_no_missing.resize(n_obs * batch_size, stream);
+      d_y_no_missing = y_no_missing.data();
+
+      raft::copy(y_no_missing.data(), d_y, n_obs * batch_size, stream);
+      MLCommon::TimeSeries::fillna(y_no_missing.data(), batch_size, n_obs, stream);
+    } else {
+      d_y_no_missing = d_y;
     }
-  }
 
-  // Estimate an ARMA fit without seasonality
-  if (order.p + order.q + order.k)
-    _arma_least_squares(handle,
-                        params.ar,
-                        params.ma,
-                        params.sigma2,
-                        bm_y,
-                        order.p,
-                        order.q,
-                        1,
-                        true,
-                        order.k,
-                        params.mu);
-
-  // Estimate a seasonal ARMA fit independantly
-  if (order.P + order.Q)
-    _arma_least_squares(handle,
-                        params.sar,
-                        params.sma,
-                        params.sigma2,
-                        bm_y,
-                        order.P,
-                        order.Q,
-                        order.s,
-                        order.p + order.q + order.k == 0);
-}
-
-void estimate_x0(raft::handle_t& handle,
-                 ARIMAParams<double>& params,
-                 const double* d_y,
-                 const double* d_exog,
-                 int batch_size,
-                 int n_obs,
-                 const ARIMAOrder& order,
-                 bool missing)
-{
-  raft::common::nvtx::range fun_scope(__func__);
-  const auto& handle_impl = handle;
-  auto stream             = handle_impl.get_stream();
-  auto cublas_handle      = handle_impl.get_cublas_handle();
-
-  /// TODO: solve exogenous coefficients with only valid rows instead of interpolation?
-  // Pros: better coefficients
-  // Cons: harder to test, a bit more complicated
-
-  // Least squares can't deal with missing values: create copy with naive
-  // replacements for missing values
-  const double* d_y_no_missing;
-  rmm::device_uvector<double> y_no_missing(0, stream);
-  if (missing) {
-    y_no_missing.resize(n_obs * batch_size, stream);
-    d_y_no_missing = y_no_missing.data();
-
-    raft::copy(y_no_missing.data(), d_y, n_obs * batch_size, stream);
-    MLCommon::TimeSeries::fillna(y_no_missing.data(), batch_size, n_obs, stream);
-  } else {
-    d_y_no_missing = d_y;
-  }
+    // Difference if necessary, copy otherwise
+    MLCommon::LinAlg::Batched::Matrix<double> bm_yd(
+      n_obs - order.d - order.s * order.D, 1, batch_size, cublas_handle, stream, false);
+    MLCommon::TimeSeries::prepare_data(
+      bm_yd.raw_data(), d_y_no_missing, batch_size, n_obs, order.d, order.D, order.s, stream);
 
-  // Difference if necessary, copy otherwise
-  MLCommon::LinAlg::Batched::Matrix<double> bm_yd(
-    n_obs - order.d - order.s * order.D, 1, batch_size, cublas_handle, stream, false);
-  MLCommon::TimeSeries::prepare_data(
-    bm_yd.raw_data(), d_y_no_missing, batch_size, n_obs, order.d, order.D, order.s, stream);
-
-  // Difference or copy exog
-  MLCommon::LinAlg::Batched::Matrix<double> bm_exog_diff(
-    n_obs - order.d - order.s * order.D, order.n_exog, batch_size, cublas_handle, stream, false);
-  if (order.n_exog > 0) {
-    MLCommon::TimeSeries::prepare_data(bm_exog_diff.raw_data(),
-                                       d_exog,
-                                       order.n_exog * batch_size,
-                                       n_obs,
-                                       order.d,
-                                       order.D,
-                                       order.s,
-                                       stream);
-  }
+    // Difference or copy exog
+    MLCommon::LinAlg::Batched::Matrix<double> bm_exog_diff(
+      n_obs - order.d - order.s * order.D, order.n_exog, batch_size, cublas_handle, stream, false);
+    if (order.n_exog > 0) {
+      MLCommon::TimeSeries::prepare_data(bm_exog_diff.raw_data(),
+                                         d_exog,
+                                         order.n_exog * batch_size,
+                                         n_obs,
+                                         order.d,
+                                         order.D,
+                                         order.s,
+                                         stream);
+    }
 
-  // Do the computation of the initial parameters
-  _start_params(handle, params, bm_yd, bm_exog_diff, order);
-}
+    // Do the computation of the initial parameters
+    _start_params(handle, params, bm_yd, bm_exog_diff, order);
+  }
 
 }  // namespace ML
diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index 712ddb3640..cd9688ec2e 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -43,207 +43,213 @@
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
-namespace ML {
-namespace HDBSCAN {
-namespace detail {
-namespace Reachability {
-
-/**
- * Extract core distances from KNN graph. This is essentially
- * performing a knn_dists[:,min_pts]
- * @tparam value_idx data type for integrals
- * @tparam value_t data type for distance
- * @tparam tpb block size for kernel
- * @param[in] knn_dists knn distance array (size n * k)
- * @param[in] min_samples this neighbor will be selected for core distances
- * @param[in] n_neighbors the number of neighbors of each point in the knn graph
- * @param[in] n number of samples
- * @param[out] out output array (size n)
- * @param[in] stream stream for which to order cuda operations
- */
-template <typename value_idx, typename value_t, int tpb = 256>
-void core_distances(
-  value_t* knn_dists, int min_samples, int n_neighbors, size_t n, value_t* out, cudaStream_t stream)
-{
-  ASSERT(n_neighbors >= min_samples,
-         "the size of the neighborhood should be greater than or equal to min_samples");
-
-  int blocks = raft::ceildiv(n, (size_t)tpb);
-
-  auto exec_policy = rmm::exec_policy(stream);
-
-  auto indices = thrust::make_counting_iterator<value_idx>(0);
-
-  thrust::transform(exec_policy, indices, indices + n, out, [=] __device__(value_idx row) {
-    return knn_dists[row * n_neighbors + (min_samples - 1)];
-  });
-}
-
-/**
- * Wraps the brute force knn API, to be used for both training and prediction
- * @tparam value_idx data type for integrals
- * @tparam value_t data type for distance
- * @param[in] handle raft handle for resource reuse
- * @param[in] X input data points (size m * n)
- * @param[out] inds nearest neighbor indices (size n_search_items * k)
- * @param[out] dists nearest neighbor distances (size n_search_items * k)
- * @param[in] m number of rows in X
- * @param[in] n number of columns in X
- * @param[in] search_items array of items to search of dimensionality D (size n_search_items * n)
- * @param[in] n_search_items number of rows in search_items
- * @param[in] k number of nearest neighbors
- * @param[in] metric distance metric to use
- */
-template <typename value_idx, typename value_t>
-void compute_knn(const raft::handle_t& handle,
-                 const value_t* X,
-                 value_idx* inds,
-                 value_t* dists,
-                 size_t m,
-                 size_t n,
-                 const value_t* search_items,
-                 size_t n_search_items,
-                 int k,
-                 raft::distance::DistanceType metric)
-{
-  auto stream      = handle.get_stream();
-  auto exec_policy = handle.get_thrust_policy();
-  std::vector<value_t*> inputs;
-  inputs.push_back(const_cast<value_t*>(X));
-
-  std::vector<int> sizes;
-  sizes.push_back(m);
-
-  // This is temporary. Once faiss is updated, we should be able to
-  // pass value_idx through to knn.
-  rmm::device_uvector<int64_t> int64_indices(k * n_search_items, stream);
-
-  // perform knn
-  brute_force_knn(handle,
-                  inputs,
-                  sizes,
-                  n,
-                  const_cast<value_t*>(search_items),
-                  n_search_items,
-                  int64_indices.data(),
-                  dists,
-                  k,
-                  true,
-                  true,
-                  metric);
-
-  // convert from current knn's 64-bit to 32-bit.
-  thrust::transform(exec_policy,
-                    int64_indices.data(),
-                    int64_indices.data() + int64_indices.size(),
-                    inds,
-                    [] __device__(int64_t in) -> value_idx { return in; });
-}
-
-/**
- * Constructs a mutual reachability graph, which is a k-nearest neighbors
- * graph projected into mutual reachability space using the following
- * function for each data point, where core_distance is the distance
- * to the kth neighbor: max(core_distance(a), core_distance(b), d(a, b))
- *
- * Unfortunately, points in the tails of the pdf (e.g. in sparse regions
- * of the space) can have very large neighborhoods, which will impact
- * nearby neighborhoods. Because of this, it's possible that the
- * radius for points in the main mass, which might have a very small
- * radius initially, to expand very large. As a result, the initial
- * knn which was used to compute the core distances may no longer
- * capture the actual neighborhoods after projection into mutual
- * reachability space.
- *
- * For the experimental version, we execute the knn twice- once
- * to compute the radii (core distances) and again to capture
- * the final neighborhoods. Future iterations of this algorithm
- * will work improve upon this "exact" version, by using
- * more specialized data structures, such as space-partitioning
- * structures. It has also been shown that approximate nearest
- * neighbors can yield reasonable neighborhoods as the
- * data sizes increase.
- *
- * @tparam value_idx
- * @tparam value_t
- * @param[in] handle raft handle for resource reuse
- * @param[in] X input data points (size m * n)
- * @param[in] m number of rows in X
- * @param[in] n number of columns in X
- * @param[in] metric distance metric to use
- * @param[in] k neighborhood size
- * @param[in] min_samples this neighborhood will be selected for core distances
- * @param[in] alpha weight applied when internal distance is chosen for
- *            mutual reachability (value of 1.0 disables the weighting)
- * @param[out] indptr CSR indptr of output knn graph (size m + 1)
- * @param[out] core_dists output core distances array (size m)
- * @param[out] out COO object, uninitialized on entry, on exit it stores the
- *             (symmetrized) maximum reachability distance for the k nearest
- *             neighbors.
- */
-template <typename value_idx, typename value_t>
-void mutual_reachability_graph(const raft::handle_t& handle,
-                               const value_t* X,
-                               size_t m,
-                               size_t n,
-                               raft::distance::DistanceType metric,
-                               int min_samples,
-                               value_t alpha,
-                               value_idx* indptr,
-                               value_t* core_dists,
-                               raft::sparse::COO<value_t, value_idx>& out)
+namespace ML
 {
-  RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
-               "Currently only L2 expanded distance is supported");
+  namespace HDBSCAN {
+  namespace detail {
+  namespace Reachability {
+
+  /**
+   * Extract core distances from KNN graph. This is essentially
+   * performing a knn_dists[:, min_samples - 1]
+   * @tparam value_idx data type for integrals
+   * @tparam value_t data type for distance
+   * @tparam tpb block size for kernel
+   * @param[in] knn_dists knn distance array (size n * k)
+   * @param[in] min_samples this neighbor will be selected for core distances
+   * @param[in] n_neighbors the number of neighbors of each point in the knn graph
+   * @param[in] n number of samples
+   * @param[out] out output array (size n)
+   * @param[in] stream stream for which to order cuda operations
+   */
+  template <typename value_idx, typename value_t, int tpb = 256>
+  void core_distances(value_t* knn_dists,
+                      int min_samples,
+                      int n_neighbors,
+                      size_t n,
+                      value_t* out,
+                      cudaStream_t stream)
+  {
+    ASSERT(n_neighbors >= min_samples,
+           "the size of the neighborhood should be greater than or equal to min_samples");
 
-  auto stream      = handle.get_stream();
-  auto exec_policy = handle.get_thrust_policy();
+    int blocks = raft::ceildiv(n, (size_t)tpb);
 
-  rmm::device_uvector<value_idx> coo_rows(min_samples * m, stream);
-  rmm::device_uvector<value_idx> inds(min_samples * m, stream);
-  rmm::device_uvector<value_t> dists(min_samples * m, stream);
+    auto exec_policy = rmm::exec_policy(stream);
 
-  // perform knn
-  compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric);
+    auto indices = thrust::make_counting_iterator<value_idx>(0);
 
-  // Slice core distances (distances to kth nearest neighbor)
-  core_distances<value_idx>(dists.data(), min_samples, min_samples, m, core_dists, stream);
+    thrust::transform(exec_policy, indices, indices + n, out, [=] __device__(value_idx row) {
+      return knn_dists[row * n_neighbors + (min_samples - 1)];
+    });
+  }
+
+  /**
+   * Wraps the brute force knn API, to be used for both training and prediction
+   * @tparam value_idx data type for integrals
+   * @tparam value_t data type for distance
+   * @param[in] handle raft handle for resource reuse
+   * @param[in] X input data points (size m * n)
+   * @param[out] inds nearest neighbor indices (size n_search_items * k)
+   * @param[out] dists nearest neighbor distances (size n_search_items * k)
+   * @param[in] m number of rows in X (must fit in an int)
+   * @param[in] n number of columns in X
+   * @param[in] search_items query points to search for (size n_search_items * n)
+   * @param[in] n_search_items number of rows in search_items
+   * @param[in] k number of nearest neighbors
+   * @param[in] metric distance metric to use
+   */
+  template <typename value_idx, typename value_t>
+  void compute_knn(const raft::handle_t& handle,
+                   const value_t* X,
+                   value_idx* inds,
+                   value_t* dists,
+                   size_t m,
+                   size_t n,
+                   const value_t* search_items,
+                   size_t n_search_items,
+                   int k,
+                   raft::distance::DistanceType metric)
+  {
+    auto stream      = handle.get_stream();
+    auto exec_policy = handle.get_thrust_policy();
+    std::vector<value_t*> inputs{const_cast<value_t*>(X)};
+
+    ASSERT(m <= static_cast<size_t>(std::numeric_limits<int>::max()),
+           "number of rows in X must fit in an int");
+    std::vector<int> sizes{static_cast<int>(m)};
+
+    // This is temporary. Once faiss is updated, we should be able to
+    // pass value_idx through to knn.
+    rmm::device_uvector<int64_t> int64_indices(k * n_search_items, stream);
+
+    // perform knn
+    brute_force_knn(handle,
+                    inputs,
+                    sizes,
+                    n,
+                    const_cast<value_t*>(search_items),
+                    n_search_items,
+                    int64_indices.data(),
+                    dists,
+                    k,
+                    true,
+                    true,
+                    metric);
+
+    // narrow the 64-bit indices returned by knn into the caller's value_idx buffer
+    thrust::transform(exec_policy,
+                      int64_indices.data(),
+                      int64_indices.data() + int64_indices.size(),
+                      inds,
+                      [] __device__(int64_t in) -> value_idx { return in; });
+  }
 
   /**
-   * Compute L2 norm
+   * Constructs a mutual reachability graph, which is a k-nearest neighbors
+   * graph projected into mutual reachability space using the following
+   * function for each data point, where core_distance is the distance
+   * to the kth neighbor: max(core_distance(a), core_distance(b), d(a, b))
+   *
+   * Unfortunately, points in the tails of the pdf (e.g. in sparse regions
+   * of the space) can have very large neighborhoods, which will impact
+   * nearby neighborhoods. Because of this, it's possible for the
+   * radius of points in the main mass, which might be very small
+   * initially, to expand considerably. As a result, the initial
+   * knn which was used to compute the core distances may no longer
+   * capture the actual neighborhoods after projection into mutual
+   * reachability space.
+   *
+   * For the experimental version, we execute the knn twice- once
+   * to compute the radii (core distances) and again to capture
+   * the final neighborhoods. Future iterations of this algorithm
+   * will improve upon this "exact" version, by using
+   * more specialized data structures, such as space-partitioning
+   * structures. It has also been shown that approximate nearest
+   * neighbors can yield reasonable neighborhoods as the
+   * data sizes increase.
+   *
+   * @tparam value_idx
+   * @tparam value_t
+   * @param[in] handle raft handle for resource reuse
+   * @param[in] X input data points (size m * n)
+   * @param[in] m number of rows in X
+   * @param[in] n number of columns in X
+   * @param[in] metric distance metric to use
+   * @param[in] min_samples neighborhood size (k) of the knn graph; the distance to the
+   *            min_samples-th neighbor is selected as the core distance
+   * @param[in] alpha weight applied when internal distance is chosen for
+   *            mutual reachability (value of 1.0 disables the weighting)
+   * @param[out] indptr CSR indptr of output knn graph (size m + 1)
+   * @param[out] core_dists output core distances array (size m)
+   * @param[out] out COO object, uninitialized on entry, on exit it stores the
+   *             (symmetrized) maximum reachability distance for the k nearest
+   *             neighbors.
    */
-  mutual_reachability_knn_l2(
-    handle, inds.data(), dists.data(), X, m, n, min_samples, core_dists, (value_t)1.0 / alpha);
-
-  // self-loops get max distance
-  auto coo_rows_counting_itr = thrust::make_counting_iterator<value_idx>(0);
-  thrust::transform(exec_policy,
-                    coo_rows_counting_itr,
-                    coo_rows_counting_itr + (m * min_samples),
-                    coo_rows.data(),
-                    [min_samples] __device__(value_idx c) -> value_idx { return c / min_samples; });
-
-  raft::sparse::linalg::symmetrize(
-    handle, coo_rows.data(), inds.data(), dists.data(), m, m, min_samples * m, out);
-
-  raft::sparse::convert::sorted_coo_to_csr(out.rows(), out.nnz, indptr, m + 1, stream);
-
-  // self-loops get max distance
-  auto transform_in =
-    thrust::make_zip_iterator(thrust::make_tuple(out.rows(), out.cols(), out.vals()));
-
-  thrust::transform(exec_policy,
-                    transform_in,
-                    transform_in + out.nnz,
-                    out.vals(),
-                    [=] __device__(const thrust::tuple<value_idx, value_idx, value_t>& tup) {
-                      return thrust::get<0>(tup) == thrust::get<1>(tup)
-                               ? std::numeric_limits<value_t>::max()
-                               : thrust::get<2>(tup);
-                    });
-}
-
-};  // end namespace Reachability
-};  // end namespace detail
-};  // end namespace HDBSCAN
-};  // end namespace ML
+  template <typename value_idx, typename value_t>
+  void mutual_reachability_graph(const raft::handle_t& handle,
+                                 const value_t* X,
+                                 size_t m,
+                                 size_t n,
+                                 raft::distance::DistanceType metric,
+                                 int min_samples,
+                                 value_t alpha,
+                                 value_idx* indptr,
+                                 value_t* core_dists,
+                                 raft::sparse::COO<value_t, value_idx>& out)
+  {
+    RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
+                 "Currently only L2 expanded distance is supported");
+
+    auto stream      = handle.get_stream();
+    auto exec_policy = handle.get_thrust_policy();
+
+    rmm::device_uvector<value_idx> coo_rows(min_samples * m, stream);
+    rmm::device_uvector<value_idx> inds(min_samples * m, stream);
+    rmm::device_uvector<value_t> dists(min_samples * m, stream);
+
+    // perform knn
+    compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric);
+
+    // Slice core distances (distances to kth nearest neighbor)
+    core_distances<value_idx>(dists.data(), min_samples, min_samples, m, core_dists, stream);
+
+    /**
+     * Run mutual_reachability_knn_l2 over the knn graph with weight 1 / alpha
+     */
+    mutual_reachability_knn_l2(
+      handle, inds.data(), dists.data(), X, m, n, min_samples, core_dists, (value_t)1.0 / alpha);
+
+    // COO row index of every knn entry: flat position c belongs to row c / min_samples
+    auto coo_rows_counting_itr = thrust::make_counting_iterator<value_idx>(0);
+    thrust::transform(
+      exec_policy,
+      coo_rows_counting_itr,
+      coo_rows_counting_itr + (m * min_samples),
+      coo_rows.data(),
+      [min_samples] __device__(value_idx c) -> value_idx { return c / min_samples; });
+
+    raft::sparse::linalg::symmetrize(
+      handle, coo_rows.data(), inds.data(), dists.data(), m, m, min_samples * m, out);
+
+    raft::sparse::convert::sorted_coo_to_csr(out.rows(), out.nnz, indptr, m + 1, stream);
+
+    // self-loops get max distance
+    auto transform_in =
+      thrust::make_zip_iterator(thrust::make_tuple(out.rows(), out.cols(), out.vals()));
+
+    thrust::transform(exec_policy,
+                      transform_in,
+                      transform_in + out.nnz,
+                      out.vals(),
+                      [=] __device__(const thrust::tuple<value_idx, value_idx, value_t>& tup) {
+                        return thrust::get<0>(tup) == thrust::get<1>(tup)
+                                 ? std::numeric_limits<value_t>::max()
+                                 : thrust::get<2>(tup);
+                      });
+  }
+
+  };  // end namespace Reachability
+  };  // end namespace detail
+  };  // end namespace HDBSCAN
+};    // end namespace ML
diff --git a/cpp/src/knn/knn_opg_common.cuh b/cpp/src/knn/knn_opg_common.cuh
index ba3481dadc..88db6858aa 100644
--- a/cpp/src/knn/knn_opg_common.cuh
+++ b/cpp/src/knn/knn_opg_common.cuh
@@ -39,944 +39,951 @@
 #include <memory>
 #include <set>
 
-namespace ML {
-namespace KNN {
-namespace opg {
-
-namespace knn_common {
-
-/**
- * The enumeration of KNN distributed operations
- */
-enum knn_operation {
-  knn,            /**< Simple KNN */
-  classification, /**< KNN classification */
-  class_proba,    /**< KNN classification probabilities */
-  regression      /**< KNN regression */
-};
-
-/**
- * A structure to store parameters for distributed KNN
- */
-template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-struct opg_knn_param {
-  opg_knn_param(knn_operation knn_op,
-                std::vector<Matrix::Data<in_t>*>* idx_data,
-                Matrix::PartDescriptor* idx_desc,
-                std::vector<Matrix::Data<in_t>*>* query_data,
-                Matrix::PartDescriptor* query_desc,
-                bool rowMajorIndex,
-                bool rowMajorQuery,
-                size_t k,
-                size_t batch_size,
-                bool verbose)
-  {
-    this->knn_op        = knn_op;
-    this->idx_data      = idx_data;
-    this->idx_desc      = idx_desc;
-    this->query_data    = query_data;
-    this->query_desc    = query_desc;
-    this->rowMajorIndex = rowMajorIndex;
-    this->rowMajorQuery = rowMajorQuery;
-    this->k             = k;
-    this->batch_size    = batch_size;
-    this->verbose       = verbose;
-  }
+namespace ML
+{
+  namespace KNN {
+  namespace opg {
+
+  namespace knn_common {
+
+  /**
+   * The enumeration of KNN distributed operations
+   */
+  enum knn_operation {
+    knn,            /**< Simple KNN */
+    classification, /**< KNN classification */
+    class_proba,    /**< KNN classification probabilities */
+    regression      /**< KNN regression */
+  };
+
+  /**
+   * Parameters for distributed KNN; outputs an operation does not use stay nullptr / 0
+   */
+  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+  struct opg_knn_param {
+    opg_knn_param(knn_operation knn_op,
+                  std::vector<Matrix::Data<in_t>*>* idx_data,
+                  Matrix::PartDescriptor* idx_desc,
+                  std::vector<Matrix::Data<in_t>*>* query_data,
+                  Matrix::PartDescriptor* query_desc,
+                  bool rowMajorIndex,
+                  bool rowMajorQuery,
+                  size_t k,
+                  size_t batch_size,
+                  bool verbose)
+    {
+      this->knn_op        = knn_op;
+      this->idx_data      = idx_data;
+      this->idx_desc      = idx_desc;
+      this->query_data    = query_data;
+      this->query_desc    = query_desc;
+      this->rowMajorIndex = rowMajorIndex;
+      this->rowMajorQuery = rowMajorQuery;
+      this->k             = k;
+      this->batch_size    = batch_size;
+      this->verbose       = verbose;
+    }
 
-  knn_operation knn_op;                                   /**< Type of KNN distributed operation */
-  std::vector<Matrix::Data<dist_t>*>* out_D    = nullptr; /**< KNN distances output array */
-  std::vector<Matrix::Data<ind_t>*>* out_I     = nullptr; /**< KNN indices output array */
-  std::vector<Matrix::Data<in_t>*>* idx_data   = nullptr; /**< Index input array */
-  Matrix::PartDescriptor* idx_desc             = nullptr; /**< Descriptor for index input array */
-  std::vector<Matrix::Data<in_t>*>* query_data = nullptr; /**< Query input array */
-  Matrix::PartDescriptor* query_desc           = nullptr; /**< Descriptor for query input array */
-  bool rowMajorIndex;                                     /**< Is index row major? */
-  bool rowMajorQuery;                                     /**< Is query row major? */
-  size_t k          = 0;                                  /**< Number of nearest neighbors */
-  size_t batch_size = 0;                                  /**< Batch size */
-  bool verbose;                                           /**< verbose */
-
-  std::size_t n_outputs = 0;              /**< Number of outputs per query (cl&re) */
-  std::vector<std::vector<out_t*>>* y;    /**< Labels input array (cl&re) */
-  std::vector<Matrix::Data<out_t>*>* out; /**< KNN outputs output array (cl&re) */
-
-  std::vector<int>* n_unique       = nullptr; /**< Number of unique labels (classification) */
-  std::vector<out_t*>* uniq_labels = nullptr; /**< Unique labels (classification) */
-  std::vector<std::vector<float*>>* probas =
-    nullptr; /**< KNN classification probabilities output array (class-probas) */
-};
-
-template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-struct KNN_params : public opg_knn_param<in_t, ind_t, dist_t, out_t> {
-  KNN_params(knn_operation knn_op,
-             std::vector<Matrix::Data<in_t>*>* idx_data,
-             Matrix::PartDescriptor* idx_desc,
-             std::vector<Matrix::Data<in_t>*>* query_data,
-             Matrix::PartDescriptor* query_desc,
-             bool rowMajorIndex,
-             bool rowMajorQuery,
-             size_t k,
-             size_t batch_size,
-             bool verbose,
-             std::vector<Matrix::Data<dist_t>*>* out_D,
-             std::vector<Matrix::Data<ind_t>*>* out_I)
-    : opg_knn_param<in_t, ind_t, dist_t, out_t>(knn_op,
-                                                idx_data,
-                                                idx_desc,
-                                                query_data,
-                                                query_desc,
-                                                rowMajorIndex,
-                                                rowMajorQuery,
-                                                k,
-                                                batch_size,
-                                                verbose)
-  {
-    this->out_D = out_D;
-    this->out_I = out_I;
-  }
-};
-
-template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-struct KNN_RE_params : public opg_knn_param<in_t, ind_t, dist_t, out_t> {
-  KNN_RE_params(knn_operation knn_op,
-                std::vector<Matrix::Data<in_t>*>* idx_data,
-                Matrix::PartDescriptor* idx_desc,
-                std::vector<Matrix::Data<in_t>*>* query_data,
-                Matrix::PartDescriptor* query_desc,
-                bool rowMajorIndex,
-                bool rowMajorQuery,
-                size_t k,
-                size_t batch_size,
-                bool verbose,
-                std::size_t n_outputs,
-                std::vector<std::vector<out_t*>>* y,
-                std::vector<Matrix::Data<out_t>*>* out)
-    : opg_knn_param<in_t, ind_t, dist_t, out_t>(knn_op,
-                                                idx_data,
-                                                idx_desc,
-                                                query_data,
-                                                query_desc,
-                                                rowMajorIndex,
-                                                rowMajorQuery,
-                                                k,
-                                                batch_size,
-                                                verbose)
-  {
-    this->n_outputs = n_outputs;
-    this->y         = y;
-    this->out       = out;
-  }
-};
-
-template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-struct KNN_CL_params : public opg_knn_param<in_t, ind_t, dist_t, out_t> {
-  KNN_CL_params(knn_operation knn_op,
-                std::vector<Matrix::Data<in_t>*>* idx_data,
-                Matrix::PartDescriptor* idx_desc,
-                std::vector<Matrix::Data<in_t>*>* query_data,
-                Matrix::PartDescriptor* query_desc,
-                bool rowMajorIndex,
-                bool rowMajorQuery,
-                size_t k,
-                size_t batch_size,
-                bool verbose,
-                std::size_t n_outputs,
-                std::vector<std::vector<out_t*>>* y,
-                std::vector<int>* n_unique,
-                std::vector<out_t*>* uniq_labels,
-                std::vector<Matrix::Data<out_t>*>* out,
-                std::vector<std::vector<float*>>* probas)
-    : opg_knn_param<in_t, ind_t, dist_t, out_t>(knn_op,
-                                                idx_data,
-                                                idx_desc,
-                                                query_data,
-                                                query_desc,
-                                                rowMajorIndex,
-                                                rowMajorQuery,
-                                                k,
-                                                batch_size,
-                                                verbose)
-  {
-    this->n_outputs   = n_outputs;
-    this->y           = y;
-    this->n_unique    = n_unique;
-    this->uniq_labels = uniq_labels;
-    this->out         = out;
-    this->probas      = probas;
-  }
-};
+    knn_operation knn_op; /**< Type of KNN distributed operation */
+    std::vector<Matrix::Data<dist_t>*>* out_D    = nullptr; /**< KNN distances output array */
+    std::vector<Matrix::Data<ind_t>*>* out_I     = nullptr; /**< KNN indices output array */
+    std::vector<Matrix::Data<in_t>*>* idx_data   = nullptr; /**< Index input array */
+    Matrix::PartDescriptor* idx_desc             = nullptr; /**< Descriptor for index input array */
+    std::vector<Matrix::Data<in_t>*>* query_data = nullptr; /**< Query input array */
+    Matrix::PartDescriptor* query_desc           = nullptr; /**< Descriptor for query input array */
+    bool rowMajorIndex;                                     /**< Is index row major? */
+    bool rowMajorQuery;                                     /**< Is query row major? */
+    size_t k          = 0;                                  /**< Number of nearest neighbors */
+    size_t batch_size = 0;                                  /**< Batch size */
+    bool verbose;                                           /**< verbose */
+
+    std::size_t n_outputs = 0;              /**< Number of outputs per query (cl&re) */
+    std::vector<std::vector<out_t*>>* y    = nullptr; /**< Labels input array (cl&re) */
+    std::vector<Matrix::Data<out_t>*>* out = nullptr; /**< KNN outputs output array (cl&re) */
+
+    std::vector<int>* n_unique       = nullptr; /**< Number of unique labels (classification) */
+    std::vector<out_t*>* uniq_labels = nullptr; /**< Unique labels (classification) */
+    std::vector<std::vector<float*>>* probas =
+      nullptr; /**< KNN classification probabilities output array (class-probas) */
+  };
+
+  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+  struct KNN_params : public opg_knn_param<in_t, ind_t, dist_t, out_t> {
+    KNN_params(knn_operation knn_op,
+               std::vector<Matrix::Data<in_t>*>* idx_data,
+               Matrix::PartDescriptor* idx_desc,
+               std::vector<Matrix::Data<in_t>*>* query_data,
+               Matrix::PartDescriptor* query_desc,
+               bool rowMajorIndex,
+               bool rowMajorQuery,
+               size_t k,
+               size_t batch_size,
+               bool verbose,
+               std::vector<Matrix::Data<dist_t>*>* out_D,
+               std::vector<Matrix::Data<ind_t>*>* out_I)
+      : opg_knn_param<in_t, ind_t, dist_t, out_t>(knn_op,
+                                                  idx_data,
+                                                  idx_desc,
+                                                  query_data,
+                                                  query_desc,
+                                                  rowMajorIndex,
+                                                  rowMajorQuery,
+                                                  k,
+                                                  batch_size,
+                                                  verbose)
+    {
+      this->out_D = out_D;
+      this->out_I = out_I;
+    }
+  };
+
+  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+  struct KNN_RE_params : public opg_knn_param<in_t, ind_t, dist_t, out_t> {
+    KNN_RE_params(knn_operation knn_op,
+                  std::vector<Matrix::Data<in_t>*>* idx_data,
+                  Matrix::PartDescriptor* idx_desc,
+                  std::vector<Matrix::Data<in_t>*>* query_data,
+                  Matrix::PartDescriptor* query_desc,
+                  bool rowMajorIndex,
+                  bool rowMajorQuery,
+                  size_t k,
+                  size_t batch_size,
+                  bool verbose,
+                  std::size_t n_outputs,
+                  std::vector<std::vector<out_t*>>* y,
+                  std::vector<Matrix::Data<out_t>*>* out)
+      : opg_knn_param<in_t, ind_t, dist_t, out_t>(knn_op,
+                                                  idx_data,
+                                                  idx_desc,
+                                                  query_data,
+                                                  query_desc,
+                                                  rowMajorIndex,
+                                                  rowMajorQuery,
+                                                  k,
+                                                  batch_size,
+                                                  verbose)
+    {
+      this->n_outputs = n_outputs;
+      this->y         = y;
+      this->out       = out;
+    }
+  };
+
+  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+  struct KNN_CL_params : public opg_knn_param<in_t, ind_t, dist_t, out_t> {
+    KNN_CL_params(knn_operation knn_op,
+                  std::vector<Matrix::Data<in_t>*>* idx_data,
+                  Matrix::PartDescriptor* idx_desc,
+                  std::vector<Matrix::Data<in_t>*>* query_data,
+                  Matrix::PartDescriptor* query_desc,
+                  bool rowMajorIndex,
+                  bool rowMajorQuery,
+                  size_t k,
+                  size_t batch_size,
+                  bool verbose,
+                  std::size_t n_outputs,
+                  std::vector<std::vector<out_t*>>* y,
+                  std::vector<int>* n_unique,
+                  std::vector<out_t*>* uniq_labels,
+                  std::vector<Matrix::Data<out_t>*>* out,
+                  std::vector<std::vector<float*>>* probas)
+      : opg_knn_param<in_t, ind_t, dist_t, out_t>(knn_op,
+                                                  idx_data,
+                                                  idx_desc,
+                                                  query_data,
+                                                  query_desc,
+                                                  rowMajorIndex,
+                                                  rowMajorQuery,
+                                                  k,
+                                                  batch_size,
+                                                  verbose)
+    {
+      this->n_outputs   = n_outputs;
+      this->y           = y;
+      this->n_unique    = n_unique;
+      this->uniq_labels = uniq_labels;
+      this->out         = out;
+      this->probas      = probas;
+    }
+  };
+
+  /**
+   * A structure to store utilities for distributed KNN operations
+   */
+  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+  struct opg_knn_work {
+    opg_knn_work(opg_knn_param<in_t, ind_t, dist_t, out_t>& params, raft::handle_t& handle)
+      : res_D(0, handle.get_stream()), res_I(0, handle.get_stream()), res(0, handle.get_stream())
+    {
+      this->my_rank           = handle.get_comms().get_rank();
+      this->idxRanks          = params.idx_desc->uniqueRanks();
+      this->idxPartsToRanks   = params.idx_desc->partsToRanks;
+      this->local_idx_parts   = params.idx_desc->blocksOwnedBy(handle.get_comms().get_rank());
+      this->queryPartsToRanks = params.query_desc->partsToRanks;
+    }
 
-/**
- * A structure to store utilities for distributed KNN operations
- */
-template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-struct opg_knn_work {
-  opg_knn_work(opg_knn_param<in_t, ind_t, dist_t, out_t>& params, raft::handle_t& handle)
-    : res_D(0, handle.get_stream()), res_I(0, handle.get_stream()), res(0, handle.get_stream())
+    int my_rank;            /**< Rank of this worker */
+    std::set<int> idxRanks; /**< Set of ranks having at least 1 index partition */
+    std::vector<Matrix::RankSizePair*> idxPartsToRanks;   /**< Index parts to rank */
+    std::vector<Matrix::RankSizePair*> local_idx_parts;   /**< List of index parts stored locally */
+    std::vector<Matrix::RankSizePair*> queryPartsToRanks; /**< Query parts to rank */
+
+    rmm::device_uvector<dist_t> res_D; /**< Temporary allocation to exchange distances */
+    rmm::device_uvector<ind_t> res_I;  /**< Temporary allocation to exchange indices */
+    rmm::device_uvector<out_t> res;    /**< Temporary allocation to exchange outputs (cl&re) */
+  };
+
+  /*!
+   Main function, computes distributed KNN operation
+   @param[in] params Parameters for distributed KNN operation
+   @param[in] handle RAFT handle
+   */
+  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+  void opg_knn(opg_knn_param<in_t, ind_t, dist_t, out_t>& params, raft::handle_t& handle)
   {
-    this->my_rank           = handle.get_comms().get_rank();
-    this->idxRanks          = params.idx_desc->uniqueRanks();
-    this->idxPartsToRanks   = params.idx_desc->partsToRanks;
-    this->local_idx_parts   = params.idx_desc->blocksOwnedBy(handle.get_comms().get_rank());
-    this->queryPartsToRanks = params.query_desc->partsToRanks;
-  }
+    opg_knn_work<in_t, ind_t, dist_t, out_t> work(params, handle);
+
+    ASSERT(params.k <= 1024, "k must be <= 1024");
+    ASSERT(params.batch_size > 0, "max_batch_size must be > 0");
+    ASSERT(params.k < params.idx_desc->M, "k must be less than the total number of query rows");
+    for (Matrix::RankSizePair* rsp : work.idxPartsToRanks) {
+      ASSERT(rsp->size >= params.k,
+             "k must be <= the number of rows in the smallest index partition.");
+    }
 
-  int my_rank;            /**< Rank of this worker */
-  std::set<int> idxRanks; /**< Set of ranks having at least 1 index partition */
-  std::vector<Matrix::RankSizePair*> idxPartsToRanks;   /**< Index parts to rank */
-  std::vector<Matrix::RankSizePair*> local_idx_parts;   /**< List of index parts stored locally */
-  std::vector<Matrix::RankSizePair*> queryPartsToRanks; /**< Query parts to rank */
-
-  rmm::device_uvector<dist_t> res_D; /**< Temporary allocation to exchange distances */
-  rmm::device_uvector<ind_t> res_I;  /**< Temporary allocation to exchange indices */
-  rmm::device_uvector<out_t> res;    /**< Temporary allocation to exchange outputs (cl&re) */
-};
-
-/*!
- Main function, computes distributed KNN operation
- @param[in] params Parameters for distrbuted KNN operation
- @param[in] handle RAFT handle
- */
-template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-void opg_knn(opg_knn_param<in_t, ind_t, dist_t, out_t>& params, raft::handle_t& handle)
-{
-  opg_knn_work<in_t, ind_t, dist_t, out_t> work(params, handle);
-
-  ASSERT(params.k <= 1024, "k must be <= 1024");
-  ASSERT(params.batch_size > 0, "max_batch_size must be > 0");
-  ASSERT(params.k < params.idx_desc->M, "k must be less than the total number of query rows");
-  for (Matrix::RankSizePair* rsp : work.idxPartsToRanks) {
-    ASSERT(rsp->size >= params.k,
-           "k must be <= the number of rows in the smallest index partition.");
-  }
+    int local_parts_completed = 0;
+    // Loop through query parts for all ranks
+    for (int i = 0; i < params.query_desc->totalBlocks(); i++) {  // For each query partitions
+      Matrix::RankSizePair* partition = work.queryPartsToRanks[i];
+      int part_rank                   = partition->rank;
+      size_t part_n_rows              = partition->size;
 
-  int local_parts_completed = 0;
-  // Loop through query parts for all ranks
-  for (int i = 0; i < params.query_desc->totalBlocks(); i++) {  // For each query partitions
-    Matrix::RankSizePair* partition = work.queryPartsToRanks[i];
-    int part_rank                   = partition->rank;
-    size_t part_n_rows              = partition->size;
+      size_t total_batches     = raft::ceildiv(part_n_rows, params.batch_size);
+      size_t total_n_processed = 0;
 
-    size_t total_batches     = raft::ceildiv(part_n_rows, params.batch_size);
-    size_t total_n_processed = 0;
+      // For each batch in a query partition
+      for (std::size_t cur_batch = 0; cur_batch < total_batches; cur_batch++) {
+        size_t cur_batch_size = params.batch_size;
 
-    // For each batch in a query partition
-    for (std::size_t cur_batch = 0; cur_batch < total_batches; cur_batch++) {
-      size_t cur_batch_size = params.batch_size;
+        if (cur_batch == total_batches - 1)
+          cur_batch_size = part_n_rows - (cur_batch * params.batch_size);
 
-      if (cur_batch == total_batches - 1)
-        cur_batch_size = part_n_rows - (cur_batch * params.batch_size);
+        if (work.my_rank == part_rank) CUML_LOG_DEBUG("Root Rank is %d", work.my_rank);
 
-      if (work.my_rank == part_rank) CUML_LOG_DEBUG("Root Rank is %d", work.my_rank);
+        /**
+         * Root broadcasts batch to all other ranks
+         */
+        CUML_LOG_DEBUG("Rank %d: Performing Broadcast", work.my_rank);
 
-      /**
-       * Root broadcasts batch to all other ranks
-       */
-      CUML_LOG_DEBUG("Rank %d: Performing Broadcast", work.my_rank);
+        rmm::device_uvector<in_t> part_data(0, handle.get_stream());
 
-      rmm::device_uvector<in_t> part_data(0, handle.get_stream());
+        size_t batch_input_elms   = cur_batch_size * params.query_desc->N;
+        size_t batch_input_offset = batch_input_elms * cur_batch;
 
-      size_t batch_input_elms   = cur_batch_size * params.query_desc->N;
-      size_t batch_input_offset = batch_input_elms * cur_batch;
+        in_t* cur_query_ptr{nullptr};
 
-      in_t* cur_query_ptr{nullptr};
+        rmm::device_uvector<in_t> tmp_batch_buf(0, handle.get_stream());
+        // current partition's owner rank broadcasts
+        if (part_rank == work.my_rank) {
+          Matrix::Data<in_t>* data = params.query_data->at(local_parts_completed);
 
-      rmm::device_uvector<in_t> tmp_batch_buf(0, handle.get_stream());
-      // current partition's owner rank broadcasts
-      if (part_rank == work.my_rank) {
-        Matrix::Data<in_t>* data = params.query_data->at(local_parts_completed);
+          // If query is column major and total_batches > 0, create a
+          // temporary buffer for the batch so that we can stack rows.
+          if (!params.rowMajorQuery && total_batches > 1) {
+            tmp_batch_buf.resize(batch_input_elms, handle.get_stream());
+            for (std::size_t col_data = 0; col_data < params.query_desc->N; col_data++) {
+              raft::copy(tmp_batch_buf.data() + (col_data * cur_batch_size),
+                         data->ptr + ((col_data * part_n_rows) + total_n_processed),
+                         cur_batch_size,
+                         handle.get_stream());
+            }
+            cur_query_ptr = tmp_batch_buf.data();
 
-        // If query is column major and total_batches > 0, create a
-        // temporary buffer for the batch so that we can stack rows.
-        if (!params.rowMajorQuery && total_batches > 1) {
-          tmp_batch_buf.resize(batch_input_elms, handle.get_stream());
-          for (std::size_t col_data = 0; col_data < params.query_desc->N; col_data++) {
-            raft::copy(tmp_batch_buf.data() + (col_data * cur_batch_size),
-                       data->ptr + ((col_data * part_n_rows) + total_n_processed),
-                       cur_batch_size,
-                       handle.get_stream());
+          } else {
+            cur_query_ptr = data->ptr + batch_input_offset;
           }
-          cur_query_ptr = tmp_batch_buf.data();
 
-        } else {
-          cur_query_ptr = data->ptr + batch_input_offset;
+          // all other (index) ranks receive
+        } else if (work.idxRanks.find(work.my_rank) != work.idxRanks.end()) {
+          part_data.resize(batch_input_elms, handle.get_stream());
+          cur_query_ptr = part_data.data();
         }
 
-        // all other (index) ranks receive
-      } else if (work.idxRanks.find(work.my_rank) != work.idxRanks.end()) {
-        part_data.resize(batch_input_elms, handle.get_stream());
-        cur_query_ptr = part_data.data();
-      }
+        bool my_rank_is_idx = work.idxRanks.find(work.my_rank) != work.idxRanks.end();
 
-      bool my_rank_is_idx = work.idxRanks.find(work.my_rank) != work.idxRanks.end();
-
-      /**
-       * Send query to index partitions
-       */
-      if (work.my_rank == part_rank || my_rank_is_idx)
-        broadcast_query(work, handle, part_rank, cur_query_ptr, batch_input_elms);
-
-      if (my_rank_is_idx) {
         /**
-         * All index ranks perform local KNN
+         * Send query to index partitions
          */
-        CUML_LOG_DEBUG("Rank %d: Performing Local KNN", work.my_rank);
+        if (work.my_rank == part_rank || my_rank_is_idx)
+          broadcast_query(work, handle, part_rank, cur_query_ptr, batch_input_elms);
 
-        size_t batch_knn_elms = params.k * cur_batch_size;
+        if (my_rank_is_idx) {
+          /**
+           * All index ranks perform local KNN
+           */
+          CUML_LOG_DEBUG("Rank %d: Performing Local KNN", work.my_rank);
 
-        if (params.knn_op != knn_operation::knn) {
-          // No labels for KNN only operation
-          work.res.resize(batch_knn_elms * params.n_outputs, handle.get_stream());
-        }
-        work.res_I.resize(batch_knn_elms, handle.get_stream());
-        work.res_D.resize(batch_knn_elms, handle.get_stream());
+          size_t batch_knn_elms = params.k * cur_batch_size;
+
+          if (params.knn_op != knn_operation::knn) {
+            // No labels for KNN only operation
+            work.res.resize(batch_knn_elms * params.n_outputs, handle.get_stream());
+          }
+          work.res_I.resize(batch_knn_elms, handle.get_stream());
+          work.res_D.resize(batch_knn_elms, handle.get_stream());
 
-        // Perform a local KNN search
-        perform_local_knn(params, work, handle, cur_query_ptr, cur_batch_size);
+          // Perform a local KNN search
+          perform_local_knn(params, work, handle, cur_query_ptr, cur_batch_size);
 
-        if (params.knn_op != knn_operation::knn) {
-          // Get the right labels for indices obtained after a KNN merge
-          copy_label_outputs_from_index_parts(params, work, handle, cur_batch_size);
+          if (params.knn_op != knn_operation::knn) {
+            // Get the right labels for indices obtained after a KNN merge
+            copy_label_outputs_from_index_parts(params, work, handle, cur_batch_size);
+          }
+        }
+
+        if (part_rank == work.my_rank || my_rank_is_idx) {
+          /**
+           * Ranks exchange results.
+           * Each rank having index partition(s) sends
+           * its local results (my_rank_is_idx)
+           * Additionally the owner of currently processed query partition
+           * receives and performs a reduce even if it has
+           * no index partition (part_rank == my_rank)
+           */
+          CUML_LOG_DEBUG("Rank %d: Exchanging results", work.my_rank);
+          exchange_results(params, work, handle, part_rank, cur_batch_size);
         }
-      }
 
-      if (part_rank == work.my_rank || my_rank_is_idx) {
         /**
-         * Ranks exchange results.
-         * Each rank having index partition(s) sends
-         * its local results (my_rank_is_idx)
-         * Additionally the owner of currently processed query partition
-         * receives and performs a reduce even if it has
-         * no index partition (part_rank == my_rank)
+         * Root rank performs local reduce
          */
-        CUML_LOG_DEBUG("Rank %d: Exchanging results", work.my_rank);
-        exchange_results(params, work, handle, part_rank, cur_batch_size);
-      }
+        if (part_rank == work.my_rank) {
+          CUML_LOG_DEBUG("Rank %d: Performing Reduce", work.my_rank);
 
-      /**
-       * Root rank performs local reduce
-       */
-      if (part_rank == work.my_rank) {
-        CUML_LOG_DEBUG("Rank %d: Performing Reduce", work.my_rank);
+          // Reduce all local results to a global result for a given query batch
+          reduce(params, work, handle, local_parts_completed, total_n_processed, cur_batch_size);
 
-        // Reduce all local results to a global result for a given query batch
-        reduce(params, work, handle, local_parts_completed, total_n_processed, cur_batch_size);
+          CUML_LOG_DEBUG("Rank %d: Finished Reduce", work.my_rank);
+        }
 
-        CUML_LOG_DEBUG("Rank %d: Finished Reduce", work.my_rank);
+        total_n_processed += cur_batch_size;
       }
 
-      total_n_processed += cur_batch_size;
+      if (work.my_rank == part_rank) local_parts_completed++;
     }
+  };
+
+  /*!
+   Broadcast query batch across all the workers
+   @param[in] work Current work for distributed KNN
+   @param[in] handle RAFT handle
+   @param[in] part_rank Rank of currently processed query batch
+   @param[in] broadcast Pointer to broadcast
+   @param[in] broadcast_size Size of broadcast
+   */
+  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+  void broadcast_query(opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+                       raft::handle_t& handle,
+                       int part_rank,
+                       in_t* broadcast,
+                       size_t broadcast_size)
+  {
+    int request_idx = 0;
+    std::vector<raft::comms::request_t> requests;
+    if (part_rank == work.my_rank) {  // Either broadcast to other workers
+      int idx_rank_size = work.idxRanks.size();
+      if (work.idxRanks.find(work.my_rank) != work.idxRanks.end()) { --idx_rank_size; }
 
-    if (work.my_rank == part_rank) local_parts_completed++;
-  }
-};
-
-/*!
- Broadcast query batch accross all the workers
- @param[in] params Parameters for distrbuted KNN operation
- @param[in] handle RAFT handle
- @param[in] part_rank Rank of currently processed query batch
- @param[in] broadcast Pointer to broadcast
- @param[in] broadcast_size Size of broadcast
- */
-template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-void broadcast_query(opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-                     raft::handle_t& handle,
-                     int part_rank,
-                     in_t* broadcast,
-                     size_t broadcast_size)
-{
-  int request_idx = 0;
-  std::vector<raft::comms::request_t> requests;
-  if (part_rank == work.my_rank) {  // Either broadcast to other workers
-    int idx_rank_size = work.idxRanks.size();
-    if (work.idxRanks.find(work.my_rank) != work.idxRanks.end()) { --idx_rank_size; }
-
-    requests.resize(idx_rank_size);
-
-    for (int rank : work.idxRanks) {
-      if (rank != work.my_rank) {
-        handle.get_comms().isend(broadcast, broadcast_size, rank, 0, requests.data() + request_idx);
-        ++request_idx;
+      requests.resize(idx_rank_size);
+
+      for (int rank : work.idxRanks) {
+        if (rank != work.my_rank) {
+          handle.get_comms().isend(
+            broadcast, broadcast_size, rank, 0, requests.data() + request_idx);
+          ++request_idx;
+        }
       }
+
+    } else {  // Or receive from broadcaster
+      requests.resize(1);
+      handle.get_comms().irecv(
+        broadcast, broadcast_size, part_rank, 0, requests.data() + request_idx);
+      ++request_idx;
     }
 
-  } else {  // Or receive from broadcaster
-    requests.resize(1);
-    handle.get_comms().irecv(
-      broadcast, broadcast_size, part_rank, 0, requests.data() + request_idx);
-    ++request_idx;
+    try {
+      handle.get_comms().waitall(requests.size(), requests.data());
+    } catch (raft::exception& e) {
+      CUML_LOG_DEBUG("FAILURE!");
+    }
   }
 
-  try {
-    handle.get_comms().waitall(requests.size(), requests.data());
-  } catch (raft::exception& e) {
-    CUML_LOG_DEBUG("FAILURE!");
-  }
-}
-
-/*!
- Perform a local KNN search for a given query batch
- @param[in] params Parameters for distrbuted KNN operation
- @param[in] work Current work for distributed KNN
- @param[in] handle RAFT handle
- @param[in] query Pointer to query
- @param[in] query_size Size of query
- */
-template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-void perform_local_knn(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
-                       opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-                       raft::handle_t& handle,
-                       in_t* query,
-                       size_t query_size)
-{
-  std::vector<in_t*> ptrs(params.idx_data->size());
-  std::vector<std::size_t> sizes(params.idx_data->size());
+  /*!
+   Perform a local KNN search for a given query batch
+   @param[in] params Parameters for distributed KNN operation
+   @param[in] work Current work for distributed KNN
+   @param[in] handle RAFT handle
+   @param[in] query Pointer to query
+   @param[in] query_size Size of query
+   */
+  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+  void perform_local_knn(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
+                         opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+                         raft::handle_t& handle,
+                         in_t* query,
+                         size_t query_size)
+  {
+    std::vector<in_t*> ptrs(params.idx_data->size());
+    std::vector<std::size_t> sizes(params.idx_data->size());
 
-  for (std::size_t cur_idx = 0; cur_idx < params.idx_data->size(); cur_idx++) {
-    ptrs[cur_idx]  = params.idx_data->at(cur_idx)->ptr;
-    sizes[cur_idx] = work.local_idx_parts[cur_idx]->size;
-  }
+    for (std::size_t cur_idx = 0; cur_idx < params.idx_data->size(); cur_idx++) {
+      ptrs[cur_idx]  = params.idx_data->at(cur_idx)->ptr;
+      sizes[cur_idx] = work.local_idx_parts[cur_idx]->size;
+    }
 
-  // Offset nearest neighbor index matrix by partition indices
-  std::vector<size_t> start_indices = params.idx_desc->startIndices(work.my_rank);
-  // PartDescriptor uses size_t while FAISS uses int64_t
-  // so we need to do a quick conversion.
-  std::vector<int64_t> start_indices_long;
-  for (size_t start_index : start_indices)
-    start_indices_long.push_back((int64_t)start_index);
-
-  // ID ranges need to be offset by each local partition's
-  // starting indices.
-  raft::spatial::knn::brute_force_knn<std::int64_t, float, std::size_t>(
-    handle,
-    ptrs,
-    sizes,
-    params.idx_desc->N,
-    query,
-    query_size,
-    work.res_I.data(),
-    work.res_D.data(),
-    params.k,
-    params.rowMajorIndex,
-    params.rowMajorQuery,
-    &start_indices_long,
-    raft::distance::DistanceType::L2SqrtExpanded);
-  handle.sync_stream(handle.get_stream());
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-}
-
-/**
- * This function copies the labels associated to the locally merged indices
- * from the index partitions to a merged array of labels
- * @param[out] out merged labels
- * @param[in] knn_indices merged indices
- * @param[in] parts unmerged labels in partitions
- * @param[in] offsets array splitting the partitions making it possible
- * to identify the origin partition of an nearest neighbor index
- * @param[in] cur_batch_size current batch size
- * @param[in] n_parts number of partitions
- * @param[in] n_labels number of labels to write (batch_size * n_outputs)
- */
-template <int TPB_X, typename ind_t, typename out_t>
-__global__ void copy_label_outputs_from_index_parts_kernel(out_t* out,
-                                                           ind_t* knn_indices,
-                                                           out_t** parts,
-                                                           uint64_t* offsets,
-                                                           size_t cur_batch_size,
-                                                           int n_parts,
-                                                           int n_labels)
-{
-  uint64_t i = (blockIdx.x * TPB_X) + threadIdx.x;
-  if (i >= n_labels) return;
-  uint64_t nn_idx = knn_indices[i];
-  int part_idx    = 0;
-  for (; part_idx < n_parts && nn_idx >= offsets[part_idx]; part_idx++) {}
-  part_idx        = std::min(std::max(0, part_idx - 1), n_parts - 1);
-  uint64_t offset = nn_idx - offsets[part_idx];
-  out[i]          = parts[part_idx][offset];
-}
-
-/*!
- Get the right labels for indices obtained after a KNN merge
- @param[in] params Parameters for distrbuted KNN operation
- @param[in] work Current work for distributed KNN
- @param[in] handle RAFT handle
- @param[in] batch_size Batch size
- */
-template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-void copy_label_outputs_from_index_parts(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
-                                         opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-                                         raft::handle_t& handle,
-                                         size_t batch_size)
-{
-  const int TPB_X = 256;
-  int n_labels    = batch_size * params.k;
-  dim3 grid(raft::ceildiv(n_labels, TPB_X));
-  dim3 blk(TPB_X);
-
-  uint64_t offset = 0;
-  std::vector<uint64_t> offsets_h;
-  for (auto& rsp : work.idxPartsToRanks) {
-    if (rsp->rank == work.my_rank) { offsets_h.push_back(offset); }
-    offset += rsp->size;
+    // Offset nearest neighbor index matrix by partition indices
+    std::vector<size_t> start_indices = params.idx_desc->startIndices(work.my_rank);
+    // PartDescriptor uses size_t while FAISS uses int64_t
+    // so we need to do a quick conversion.
+    std::vector<int64_t> start_indices_long;
+    for (size_t start_index : start_indices)
+      start_indices_long.push_back((int64_t)start_index);
+
+    // ID ranges need to be offset by each local partition's
+    // starting indices.
+    raft::spatial::knn::brute_force_knn<std::int64_t, float, std::size_t>(
+      handle,
+      ptrs,
+      sizes,
+      params.idx_desc->N,
+      query,
+      query_size,
+      work.res_I.data(),
+      work.res_D.data(),
+      params.k,
+      params.rowMajorIndex,
+      params.rowMajorQuery,
+      &start_indices_long,
+      raft::distance::DistanceType::L2SqrtExpanded);
+    handle.sync_stream(handle.get_stream());
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
   }
-  std::size_t n_parts = offsets_h.size();
-  rmm::device_uvector<uint64_t> offsets_d(n_parts, handle.get_stream());
-  raft::update_device(offsets_d.data(), offsets_h.data(), n_parts, handle.get_stream());
-
-  std::vector<out_t*> parts_h(n_parts);
-  rmm::device_uvector<out_t*> parts_d(n_parts, handle.get_stream());
-  for (std::size_t o = 0; o < params.n_outputs; o++) {
-    for (std::size_t p = 0; p < n_parts; p++) {
-      parts_h[p] = params.y->at(p)[o];
-    }
-    raft::update_device(parts_d.data(), parts_h.data(), n_parts, handle.get_stream());
-
-    copy_label_outputs_from_index_parts_kernel<TPB_X, ind_t, out_t>
-      <<<grid, blk, 0, handle.get_stream()>>>(work.res.data() + (o * n_labels),
-                                              work.res_I.data(),
-                                              parts_d.data(),
-                                              offsets_d.data(),
-                                              batch_size,
-                                              n_parts,
-                                              n_labels);
+
+  /**
+   * This function copies the labels associated to the locally merged indices
+   * from the index partitions to a merged array of labels
+   * @param[out] out merged labels
+   * @param[in] knn_indices merged indices
+   * @param[in] parts unmerged labels in partitions
+   * @param[in] offsets array splitting the partitions making it possible
+   * to identify the origin partition of a nearest neighbor index
+   * @param[in] cur_batch_size current batch size
+   * @param[in] n_parts number of partitions
+   * @param[in] n_labels number of labels to write (batch_size * k)
+   */
+  template <int TPB_X, typename ind_t, typename out_t>
+  __global__ void copy_label_outputs_from_index_parts_kernel(out_t* out,
+                                                             ind_t* knn_indices,
+                                                             out_t** parts,
+                                                             uint64_t* offsets,
+                                                             size_t cur_batch_size,
+                                                             int n_parts,
+                                                             int n_labels)
+  {
+    uint64_t i = (blockIdx.x * TPB_X) + threadIdx.x;
+    if (i >= n_labels) return;
+    uint64_t nn_idx = knn_indices[i];
+    int part_idx    = 0;
+    for (; part_idx < n_parts && nn_idx >= offsets[part_idx]; part_idx++) {}
+    part_idx        = std::min(std::max(0, part_idx - 1), n_parts - 1);
+    uint64_t offset = nn_idx - offsets[part_idx];
+    out[i]          = parts[part_idx][offset];
   }
-  handle.sync_stream(handle.get_stream());
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-}
-
-/*!
- Exchange results of local KNN search and operation for a given query batch
- All non-root index ranks send the results for the current
- query batch to the root rank for the batch.
- @param[in] params Parameters for distrbuted KNN operation
- @param[in] work Current work for distributed KNN
- @param[in] handle RAFT handle
- @param[in] part_rank Rank of currently processed query batch
- @param[in] batch_size Batch size
- */
-template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-void exchange_results(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
-                      opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-                      raft::handle_t& handle,
-                      int part_rank,
-                      size_t batch_size)
-{
-  size_t batch_elms = batch_size * params.k;
 
-  int request_idx = 0;
-  std::vector<raft::comms::request_t> requests;
-  if (part_rank != work.my_rank) {  // Either send local KNN results
-    requests.resize(2);
-    handle.get_comms().isend(
-      work.res_I.data(), batch_elms, part_rank, 0, requests.data() + request_idx);
-    ++request_idx;
+  /*!
+   Get the right labels for indices obtained after a KNN merge
+   @param[in] params Parameters for distributed KNN operation
+   @param[in] work Current work for distributed KNN
+   @param[in] handle RAFT handle
+   @param[in] batch_size Batch size
+   */
+  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+  void copy_label_outputs_from_index_parts(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
+                                           opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+                                           raft::handle_t& handle,
+                                           size_t batch_size)
+  {
+    const int TPB_X = 256;
+    int n_labels    = batch_size * params.k;
+    dim3 grid(raft::ceildiv(n_labels, TPB_X));
+    dim3 blk(TPB_X);
+
+    uint64_t offset = 0;
+    std::vector<uint64_t> offsets_h;
+    for (auto& rsp : work.idxPartsToRanks) {
+      if (rsp->rank == work.my_rank) { offsets_h.push_back(offset); }
+      offset += rsp->size;
+    }
+    std::size_t n_parts = offsets_h.size();
+    rmm::device_uvector<uint64_t> offsets_d(n_parts, handle.get_stream());
+    raft::update_device(offsets_d.data(), offsets_h.data(), n_parts, handle.get_stream());
+
+    std::vector<out_t*> parts_h(n_parts);
+    rmm::device_uvector<out_t*> parts_d(n_parts, handle.get_stream());
+    for (std::size_t o = 0; o < params.n_outputs; o++) {
+      for (std::size_t p = 0; p < n_parts; p++) {
+        parts_h[p] = params.y->at(p)[o];
+      }
+      raft::update_device(parts_d.data(), parts_h.data(), n_parts, handle.get_stream());
 
-    handle.get_comms().isend(
-      work.res_D.data(), batch_elms, part_rank, 0, requests.data() + request_idx);
-    ++request_idx;
+      copy_label_outputs_from_index_parts_kernel<TPB_X, ind_t, out_t>
+        <<<grid, blk, 0, handle.get_stream()>>>(work.res.data() + (o * n_labels),
+                                                work.res_I.data(),
+                                                parts_d.data(),
+                                                offsets_d.data(),
+                                                batch_size,
+                                                n_parts,
+                                                n_labels);
+    }
+    handle.sync_stream(handle.get_stream());
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  }
 
-    if (params.knn_op != knn_operation::knn) {
-      requests.resize(2 + params.n_outputs);
-      for (std::size_t o = 0; o < params.n_outputs; o++) {
-        handle.get_comms().isend(work.res.data() + (o * batch_elms),
-                                 batch_elms,
-                                 part_rank,
-                                 0,
-                                 requests.data() + request_idx);
-        ++request_idx;
+  /*!
+   Exchange results of local KNN search and operation for a given query batch
+   All non-root index ranks send the results for the current
+   query batch to the root rank for the batch.
+   @param[in] params Parameters for distributed KNN operation
+   @param[in] work Current work for distributed KNN
+   @param[in] handle RAFT handle
+   @param[in] part_rank Rank of currently processed query batch
+   @param[in] batch_size Batch size
+   */
+  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+  void exchange_results(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
+                        opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+                        raft::handle_t& handle,
+                        int part_rank,
+                        size_t batch_size)
+  {
+    size_t batch_elms = batch_size * params.k;
+
+    int request_idx = 0;
+    std::vector<raft::comms::request_t> requests;
+    if (part_rank != work.my_rank) {  // Either send local KNN results
+      requests.resize(2);
+      handle.get_comms().isend(
+        work.res_I.data(), batch_elms, part_rank, 0, requests.data() + request_idx);
+      ++request_idx;
+
+      handle.get_comms().isend(
+        work.res_D.data(), batch_elms, part_rank, 0, requests.data() + request_idx);
+      ++request_idx;
+
+      if (params.knn_op != knn_operation::knn) {
+        requests.resize(2 + params.n_outputs);
+        for (std::size_t o = 0; o < params.n_outputs; o++) {
+          handle.get_comms().isend(work.res.data() + (o * batch_elms),
+                                   batch_elms,
+                                   part_rank,
+                                   0,
+                                   requests.data() + request_idx);
+          ++request_idx;
+        }
+      }
+    } else {  // Or, as the owner of currently processed query batch,
+              // receive results from other workers for reduce
+      bool part_rank_is_idx = work.idxRanks.find(part_rank) != work.idxRanks.end();
+      size_t idx_rank_size  = work.idxRanks.size();
+
+      // if root rank is an index, it will already have
+      // query data, so no need to receive from it.
+      work.res_I.resize(batch_elms * idx_rank_size, handle.get_stream());
+      work.res_D.resize(batch_elms * idx_rank_size, handle.get_stream());
+
+      if (params.knn_op != knn_operation::knn) {
+        work.res.resize(batch_elms * params.n_outputs * idx_rank_size, handle.get_stream());
       }
-    }
-  } else {  // Or, as the owner of currently processed query batch,
-            // receive results from other workers for reduce
-    bool part_rank_is_idx = work.idxRanks.find(part_rank) != work.idxRanks.end();
-    size_t idx_rank_size  = work.idxRanks.size();
 
-    // if root rank is an index, it will already have
-    // query data, so no need to receive from it.
-    work.res_I.resize(batch_elms * idx_rank_size, handle.get_stream());
-    work.res_D.resize(batch_elms * idx_rank_size, handle.get_stream());
+      if (part_rank_is_idx) {
+        /**
+         * If this worker (in charge of reduce),
+         * has some local results as well,
+         * copy them at right location
+         */
+        --idx_rank_size;
+        int i = 0;
+        for (int rank : work.idxRanks) {
+          if (rank == work.my_rank) {
+            size_t batch_offset = batch_elms * i;
+
+            // Indices and distances are stored in rank order
+            raft::copy_async(
+              work.res_I.data() + batch_offset, work.res_I.data(), batch_elms, handle.get_stream());
+            raft::copy_async(
+              work.res_D.data() + batch_offset, work.res_D.data(), batch_elms, handle.get_stream());
+
+            if (params.knn_op != knn_operation::knn) {
+              rmm::device_uvector<out_t> tmp_res(params.n_outputs * batch_elms,
+                                                 handle.get_stream());
+              raft::copy_async(
+                tmp_res.data(), work.res.data(), tmp_res.size(), handle.get_stream());
+
+              for (std::size_t o = 0; o < params.n_outputs; ++o) {
+                // Outputs are stored in target order and then in rank order
+                raft::copy_async(
+                  work.res.data() + (o * work.idxRanks.size() * batch_elms) + batch_offset,
+                  tmp_res.data() + (o * batch_elms),
+                  batch_elms,
+                  handle.get_stream());
+              }
+            }
+            handle.sync_stream(handle.get_stream());
+            break;
+          }
+          i++;
+        }
+      }
 
-    if (params.knn_op != knn_operation::knn) {
-      work.res.resize(batch_elms * params.n_outputs * idx_rank_size, handle.get_stream());
-    }
+      size_t request_size = 2 * idx_rank_size;
+      if (params.knn_op != knn_operation::knn)
+        request_size = (2 + params.n_outputs) * idx_rank_size;
+      requests.resize(request_size);
 
-    if (part_rank_is_idx) {
-      /**
-       * If this worker (in charge of reduce),
-       * has some local results as well,
-       * copy them at right location
-       */
-      --idx_rank_size;
-      int i = 0;
+      int num_received = 0;
       for (int rank : work.idxRanks) {
-        if (rank == work.my_rank) {
-          size_t batch_offset = batch_elms * i;
+        if (rank != work.my_rank) {
+          size_t batch_offset = batch_elms * num_received;
 
           // Indices and distances are stored in rank order
-          raft::copy_async(
-            work.res_I.data() + batch_offset, work.res_I.data(), batch_elms, handle.get_stream());
-          raft::copy_async(
-            work.res_D.data() + batch_offset, work.res_D.data(), batch_elms, handle.get_stream());
+          handle.get_comms().irecv(
+            work.res_I.data() + batch_offset, batch_elms, rank, 0, requests.data() + request_idx);
+          ++request_idx;
+          handle.get_comms().irecv(
+            work.res_D.data() + batch_offset, batch_elms, rank, 0, requests.data() + request_idx);
+          ++request_idx;
 
           if (params.knn_op != knn_operation::knn) {
-            rmm::device_uvector<out_t> tmp_res(params.n_outputs * batch_elms, handle.get_stream());
-            raft::copy_async(tmp_res.data(), work.res.data(), tmp_res.size(), handle.get_stream());
-
-            for (std::size_t o = 0; o < params.n_outputs; ++o) {
+            for (std::size_t o = 0; o < params.n_outputs; o++) {
               // Outputs are stored in target order and then in rank order
-              raft::copy_async(
-                work.res.data() + (o * work.idxRanks.size() * batch_elms) + batch_offset,
-                tmp_res.data() + (o * batch_elms),
-                batch_elms,
-                handle.get_stream());
+              out_t* r = work.res.data() + (o * work.idxRanks.size() * batch_elms) + batch_offset;
+              handle.get_comms().irecv(r, batch_elms, rank, 0, requests.data() + request_idx);
+              ++request_idx;
             }
           }
-          handle.sync_stream(handle.get_stream());
-          break;
         }
-        i++;
+        if (rank != work.my_rank || part_rank_is_idx) {
+          /**
+           * Increase index for each new reception
+           * Also increase index when the worker doing a reduce operation
+           * has some index data (previously copied at right location).
+           */
+          ++num_received;
+        }
       }
     }
 
-    size_t request_size = 2 * idx_rank_size;
-    if (params.knn_op != knn_operation::knn) request_size = (2 + params.n_outputs) * idx_rank_size;
-    requests.resize(request_size);
-
-    int num_received = 0;
-    for (int rank : work.idxRanks) {
-      if (rank != work.my_rank) {
-        size_t batch_offset = batch_elms * num_received;
-
-        // Indices and distances are stored in rank order
-        handle.get_comms().irecv(
-          work.res_I.data() + batch_offset, batch_elms, rank, 0, requests.data() + request_idx);
-        ++request_idx;
-        handle.get_comms().irecv(
-          work.res_D.data() + batch_offset, batch_elms, rank, 0, requests.data() + request_idx);
-        ++request_idx;
-
-        if (params.knn_op != knn_operation::knn) {
-          for (std::size_t o = 0; o < params.n_outputs; o++) {
-            // Outputs are stored in target order and then in rank order
-            out_t* r = work.res.data() + (o * work.idxRanks.size() * batch_elms) + batch_offset;
-            handle.get_comms().irecv(r, batch_elms, rank, 0, requests.data() + request_idx);
-            ++request_idx;
-          }
-        }
-      }
-      if (rank != work.my_rank || part_rank_is_idx) {
-        /**
-         * Increase index for each new reception
-         * Also increase index when the worker doing a reduce operation
-         * has some index data (previously copied at right location).
-         */
-        ++num_received;
-      }
+    try {
+      handle.get_comms().waitall(requests.size(), requests.data());
+    } catch (raft::exception& e) {
+      CUML_LOG_DEBUG("FAILURE!");
     }
   }
 
-  try {
-    handle.get_comms().waitall(requests.size(), requests.data());
-  } catch (raft::exception& e) {
-    CUML_LOG_DEBUG("FAILURE!");
-  }
-}
-
-/*!
- Reduce all local results to a global result for a given query batch
- @param[in] params Parameters for distrbuted KNN operation
- @param[in] work Current work for distributed KNN
- @param[in] handle RAFT handle
- @param[in] part_idx Partition index of query batch
- @param[in] processed_in_part Number of queries already processed in part (serves as offset)
- @param[in] batch_size Batch size
- */
-template <typename in_t,
-          typename ind_t,
-          typename dist_t,
-          typename out_t,
-          typename trans_t = int64_t>
-void reduce(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
-            opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-            raft::handle_t& handle,
-            int part_idx,
-            size_t processed_in_part,
-            size_t batch_size)
-{
-  rmm::device_uvector<trans_t> trans(work.idxRanks.size(), handle.get_stream());
-  RAFT_CUDA_TRY(
-    cudaMemsetAsync(trans.data(), 0, work.idxRanks.size() * sizeof(trans_t), handle.get_stream()));
-
-  size_t batch_offset = processed_in_part * params.k;
-
-  ind_t* indices    = nullptr;
-  dist_t* distances = nullptr;
-
-  rmm::device_uvector<ind_t> indices_b(0, handle.get_stream());
-  rmm::device_uvector<dist_t> distances_b(0, handle.get_stream());
-
-  if (params.knn_op == knn_operation::knn) {
-    indices   = params.out_I->at(part_idx)->ptr + batch_offset;
-    distances = params.out_D->at(part_idx)->ptr + batch_offset;
-  } else {
-    indices_b.resize(batch_size * params.k, handle.get_stream());
-    distances_b.resize(batch_size * params.k, handle.get_stream());
-    indices   = indices_b.data();
-    distances = distances_b.data();
-  }
+  /*!
+   Reduce all local results to a global result for a given query batch
+   @param[in] params Parameters for distributed KNN operation
+   @param[in] work Current work for distributed KNN
+   @param[in] handle RAFT handle
+   @param[in] part_idx Partition index of query batch
+   @param[in] processed_in_part Number of queries already processed in part (serves as offset)
+   @param[in] batch_size Batch size
+   */
+  template <typename in_t,
+            typename ind_t,
+            typename dist_t,
+            typename out_t,
+            typename trans_t = int64_t>
+  void reduce(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
+              opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+              raft::handle_t& handle,
+              int part_idx,
+              size_t processed_in_part,
+              size_t batch_size)
+  {
+    rmm::device_uvector<trans_t> trans(work.idxRanks.size(), handle.get_stream());
+    RAFT_CUDA_TRY(cudaMemsetAsync(
+      trans.data(), 0, work.idxRanks.size() * sizeof(trans_t), handle.get_stream()));
+
+    size_t batch_offset = processed_in_part * params.k;
+
+    ind_t* indices    = nullptr;
+    dist_t* distances = nullptr;
 
-  // Merge all KNN local results
-  raft::spatial::knn::knn_merge_parts(work.res_D.data(),
-                                      work.res_I.data(),
-                                      distances,
-                                      indices,
-                                      batch_size,
-                                      work.idxRanks.size(),
-                                      params.k,
-                                      handle.get_stream(),
-                                      trans.data());
-  handle.sync_stream(handle.get_stream());
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-  if (params.knn_op != knn_operation::knn) {
-    rmm::device_uvector<out_t> merged_outputs_b(params.n_outputs * batch_size * params.k,
-                                                handle.get_stream());
-    // Get the right labels for indices obtained after local KNN searches
-    merge_labels(params,
-                 work,
-                 handle,
-                 merged_outputs_b.data(),
-                 indices,
-                 work.res.data(),
-                 work.res_I.data(),
-                 batch_size);
-
-    out_t* outputs = nullptr;
-    std::vector<float*> probas_with_offsets;
-
-    if (params.knn_op != knn_operation::class_proba) {
-      outputs = params.out->at(part_idx)->ptr + (processed_in_part * params.n_outputs);
+    rmm::device_uvector<ind_t> indices_b(0, handle.get_stream());
+    rmm::device_uvector<dist_t> distances_b(0, handle.get_stream());
+
+    if (params.knn_op == knn_operation::knn) {
+      indices   = params.out_I->at(part_idx)->ptr + batch_offset;
+      distances = params.out_D->at(part_idx)->ptr + batch_offset;
     } else {
-      std::vector<float*>& probas_part = params.probas->at(part_idx);
-      for (std::size_t i = 0; i < params.n_outputs; i++) {
-        float* ptr           = probas_part[i];
-        int n_unique_classes = params.n_unique->at(i);
-        probas_with_offsets.push_back(ptr + (processed_in_part * n_unique_classes));
-      }
+      indices_b.resize(batch_size * params.k, handle.get_stream());
+      distances_b.resize(batch_size * params.k, handle.get_stream());
+      indices   = indices_b.data();
+      distances = distances_b.data();
     }
 
-    // Perform final classification, regression or class-proba operation
-    perform_local_operation(
-      params, work, handle, outputs, probas_with_offsets, merged_outputs_b.data(), batch_size);
-
+    // Merge all KNN local results
+    raft::spatial::knn::knn_merge_parts(work.res_D.data(),
+                                        work.res_I.data(),
+                                        distances,
+                                        indices,
+                                        batch_size,
+                                        work.idxRanks.size(),
+                                        params.k,
+                                        handle.get_stream(),
+                                        trans.data());
     handle.sync_stream(handle.get_stream());
     RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
-}
-
-/**
- * This function copies the labels associated to the merged indices
- * from the unmerged to a merged (n_ranks times smaller) array of labels
- * @param[out] outputs merged labels
- * @param[in] knn_indices merged indices
- * @param[in] unmerged_outputs unmerged labels
- * @param[in] unmerged_knn_indices unmerged indices
- * @param[in] offsets array splitting the partitions making it possible
- * to identify the origin partition of an nearest neighbor index
- * @param[in] parts_to_ranks get rank index from index partition index,
- * informative to find positions as the unmerged arrays are built
- * so that ranks are in order (unlike partitions)
- * @param[in] nearest_neighbors number of nearest neighbors to look for in query
- * @param[in] n_outputs number of targets
- * @param[in] n_labels number of labels to write (batch_size * n_outputs)
- * @param[in] n_parts number of index partitions
- * @param[in] n_ranks number of index ranks
- */
-template <int TPB_X, typename dist_t, typename out_t>
-__global__ void merge_labels_kernel(out_t* outputs,
-                                    dist_t* knn_indices,
-                                    out_t* unmerged_outputs,
-                                    dist_t* unmerged_knn_indices,
-                                    size_t* offsets,
-                                    int* parts_to_ranks,
-                                    int nearest_neighbors,
-                                    int n_outputs,
-                                    int n_labels,
-                                    int n_parts,
-                                    int n_ranks)
-{
-  uint64_t i = (blockIdx.x * TPB_X) + threadIdx.x;
-  if (i >= n_labels) return;
-  uint64_t nn_idx = knn_indices[i];
-  int part_idx    = 0;
-  for (; part_idx < n_parts && nn_idx >= offsets[part_idx]; part_idx++) {}
-  part_idx         = std::min(std::max(0, part_idx - 1), n_parts - 1);
-  int rank_idx     = parts_to_ranks[part_idx];
-  int inbatch_idx  = i / nearest_neighbors;
-  uint64_t elm_idx = (rank_idx * n_labels) + inbatch_idx * nearest_neighbors;
-  for (int k = 0; k < nearest_neighbors; k++) {
-    if (nn_idx == unmerged_knn_indices[elm_idx + k]) {
-      for (int o = 0; o < n_outputs; o++) {
-        outputs[(o * n_labels) + i] = unmerged_outputs[(o * n_ranks * n_labels) + elm_idx + k];
+
+    if (params.knn_op != knn_operation::knn) {
+      rmm::device_uvector<out_t> merged_outputs_b(params.n_outputs * batch_size * params.k,
+                                                  handle.get_stream());
+      // Get the right labels for indices obtained after local KNN searches
+      merge_labels(params,
+                   work,
+                   handle,
+                   merged_outputs_b.data(),
+                   indices,
+                   work.res.data(),
+                   work.res_I.data(),
+                   batch_size);
+
+      out_t* outputs = nullptr;
+      std::vector<float*> probas_with_offsets;
+
+      if (params.knn_op != knn_operation::class_proba) {
+        outputs = params.out->at(part_idx)->ptr + (processed_in_part * params.n_outputs);
+      } else {
+        std::vector<float*>& probas_part = params.probas->at(part_idx);
+        for (std::size_t i = 0; i < params.n_outputs; i++) {
+          float* ptr           = probas_part[i];
+          int n_unique_classes = params.n_unique->at(i);
+          probas_with_offsets.push_back(ptr + (processed_in_part * n_unique_classes));
+        }
       }
-      return;
+
+      // Perform final classification, regression or class-proba operation
+      perform_local_operation(
+        params, work, handle, outputs, probas_with_offsets, merged_outputs_b.data(), batch_size);
+
+      handle.sync_stream(handle.get_stream());
+      RAFT_CUDA_TRY(cudaPeekAtLastError());
     }
   }
-}
-
-/*!
- Get the right labels for indices obtained after local KNN searches
- @param[in] params Parameters for distrbuted KNN operation
- @param[in] work Current work for distributed KNN
- @param[in] handle RAFT handle
- @param[out] output KNN outputs output array
- @param[out] knn_indices KNN class-probas output array (class-proba only)
- @param[in] unmerged_outputs KNN labels input array
- @param[in] unmerged_knn_indices Batch size
- @param[in] batch_size Batch size
- */
-template <typename opg_knn_param_t, typename opg_knn_work_t, typename ind_t, typename out_t>
-void merge_labels(opg_knn_param_t& params,
-                  opg_knn_work_t& work,
-                  raft::handle_t& handle,
-                  out_t* output,
-                  ind_t* knn_indices,
-                  out_t* unmerged_outputs,
-                  ind_t* unmerged_knn_indices,
-                  int batch_size)
-{
-  const int TPB_X = 256;
-  int n_labels    = batch_size * params.k;
-  dim3 grid(raft::ceildiv(n_labels, TPB_X));
-  dim3 blk(TPB_X);
-
-  int offset = 0;
-  std::vector<uint64_t> offsets_h;
-  for (auto& rsp : work.idxPartsToRanks) {
-    offsets_h.push_back(offset);
-    offset += rsp->size;
-  }
-  rmm::device_uvector<uint64_t> offsets_d(offsets_h.size(), handle.get_stream());
-  raft::update_device(offsets_d.data(), offsets_h.data(), offsets_h.size(), handle.get_stream());
-
-  std::vector<int> parts_to_ranks_h;
-  for (auto& rsp : work.idxPartsToRanks) {
-    int i = 0;
-    for (int rank : work.idxRanks) {
-      if (rank == rsp->rank) { parts_to_ranks_h.push_back(i); }
-      ++i;
+
+  /**
+   * For each merged index, copies the labels of the matching unmerged index into a
+   * merged (n_ranks times smaller) array of labels; unmatched entries are not written
+   * @param[out] outputs merged labels
+   * @param[in] knn_indices merged indices
+   * @param[in] unmerged_outputs unmerged labels
+   * @param[in] unmerged_knn_indices unmerged indices
+   * @param[in] offsets array splitting the partitions, making it possible
+   * to identify the origin partition of a nearest neighbor index
+   * @param[in] parts_to_ranks maps an index partition index to its rank index,
+   * needed to find positions as the unmerged arrays are built
+   * so that ranks are in order (unlike partitions)
+   * @param[in] nearest_neighbors number of nearest neighbors to look for in query
+   * @param[in] n_outputs number of targets
+   * @param[in] n_labels number of labels per target (the launcher passes batch_size * k)
+   * @param[in] n_parts number of index partitions
+   * @param[in] n_ranks number of index ranks
+   */
+  template <int TPB_X, typename dist_t, typename out_t>
+  __global__ void merge_labels_kernel(out_t* outputs,
+                                      dist_t* knn_indices,
+                                      out_t* unmerged_outputs,
+                                      dist_t* unmerged_knn_indices,
+                                      size_t* offsets,
+                                      int* parts_to_ranks,
+                                      int nearest_neighbors,
+                                      int n_outputs,
+                                      int n_labels,
+                                      int n_parts,
+                                      int n_ranks)
+  {
+    uint64_t i = (blockIdx.x * TPB_X) + threadIdx.x;  // assumes blockDim.x == TPB_X
+    if (i >= n_labels) return;  // threads past the last label do nothing
+    uint64_t nn_idx = knn_indices[i];
+    int part_idx    = 0;
+    for (; part_idx < n_parts && nn_idx >= offsets[part_idx]; part_idx++) {}  // first offset > idx
+    part_idx         = std::min(std::max(0, part_idx - 1), n_parts - 1);  // step back and clamp
+    int rank_idx     = parts_to_ranks[part_idx];
+    int inbatch_idx  = i / nearest_neighbors;  // row of this label within the batch
+    uint64_t elm_idx = (rank_idx * n_labels) + inbatch_idx * nearest_neighbors;  // NOTE: int math
+    for (int k = 0; k < nearest_neighbors; k++) {
+      if (nn_idx == unmerged_knn_indices[elm_idx + k]) {  // search that rank's candidates
+        for (int o = 0; o < n_outputs; o++) {
+          outputs[(o * n_labels) + i] = unmerged_outputs[(o * n_ranks * n_labels) + elm_idx + k];
+        }
+        return;  // only the first match is copied
+      }
     }
   }
-  rmm::device_uvector<int> parts_to_ranks_d(parts_to_ranks_h.size(), handle.get_stream());
-  raft::update_device(
-    parts_to_ranks_d.data(), parts_to_ranks_h.data(), parts_to_ranks_h.size(), handle.get_stream());
-
-  merge_labels_kernel<TPB_X><<<grid, blk, 0, handle.get_stream()>>>(output,
-                                                                    knn_indices,
-                                                                    unmerged_outputs,
-                                                                    unmerged_knn_indices,
-                                                                    offsets_d.data(),
-                                                                    parts_to_ranks_d.data(),
-                                                                    params.k,
-                                                                    params.n_outputs,
-                                                                    n_labels,
-                                                                    work.idxPartsToRanks.size(),
-                                                                    work.idxRanks.size());
-}
-
-/*!
- Perform final classification, regression or class-proba operation for a given query batch
- @param[in] params Parameters for distrbuted KNN operation
- @param[in] work Current work for distributed KNN
- @param[in] handle RAFT handle
- @param[out] outputs KNN outputs output array
- @param[out] probas_with_offsets KNN class-probas output array (class-proba only)
- @param[in] labels KNN labels input array
- @param[in] batch_size Batch size
- */
-template <typename in_t,
-          typename ind_t,
-          typename dist_t,
-          typename out_t,
-          typename std::enable_if<std::is_floating_point<out_t>::value>::type* = nullptr>
-void perform_local_operation(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
-                             opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-                             raft::handle_t& handle,
-                             out_t* outputs,
-                             std::vector<float*>& probas_with_offsets,
-                             out_t* labels,
-                             size_t batch_size)
-{
-  size_t n_labels = batch_size * params.k;
-  std::vector<out_t*> y(params.n_outputs);
-  for (std::size_t o = 0; o < params.n_outputs; o++) {
-    y[o] = reinterpret_cast<out_t*>(labels) + (o * n_labels);
+
+  /*!
+   Get the right labels for indices obtained after local KNN searches
+   @param[in] params Parameters for distributed KNN operation
+   @param[in] work Current work for distributed KNN
+   @param[in] handle RAFT handle
+   @param[out] output Merged labels output array
+   @param[in] knn_indices Merged KNN indices
+   @param[in] unmerged_outputs KNN labels input array
+   @param[in] unmerged_knn_indices Unmerged KNN indices
+   @param[in] batch_size Batch size
+   */
+  template <typename opg_knn_param_t, typename opg_knn_work_t, typename ind_t, typename out_t>
+  void merge_labels(opg_knn_param_t& params,
+                    opg_knn_work_t& work,
+                    raft::handle_t& handle,
+                    out_t* output,
+                    ind_t* knn_indices,
+                    out_t* unmerged_outputs,
+                    ind_t* unmerged_knn_indices,
+                    int batch_size)
+  {
+    const int TPB_X = 256;
+    int n_labels    = batch_size * params.k;  // labels per target
+    dim3 grid(raft::ceildiv(n_labels, TPB_X));  // one thread per label
+    dim3 blk(TPB_X);
+
+    int offset = 0;  // running start offset; NOTE(review): int, confirm it cannot overflow
+    std::vector<uint64_t> offsets_h;
+    for (auto& rsp : work.idxPartsToRanks) {
+      offsets_h.push_back(offset);
+      offset += rsp->size;
+    }
+    rmm::device_uvector<uint64_t> offsets_d(offsets_h.size(), handle.get_stream());
+    raft::update_device(offsets_d.data(), offsets_h.data(), offsets_h.size(), handle.get_stream());
+
+    std::vector<int> parts_to_ranks_h;  // position of each partition's rank within idxRanks
+    for (auto& rsp : work.idxPartsToRanks) {
+      int i = 0;
+      for (int rank : work.idxRanks) {
+        if (rank == rsp->rank) { parts_to_ranks_h.push_back(i); }
+        ++i;
+      }
+    }
+    rmm::device_uvector<int> parts_to_ranks_d(parts_to_ranks_h.size(), handle.get_stream());
+    raft::update_device(parts_to_ranks_d.data(),
+                        parts_to_ranks_h.data(),
+                        parts_to_ranks_h.size(),
+                        handle.get_stream());
+
+    merge_labels_kernel<TPB_X><<<grid, blk, 0, handle.get_stream()>>>(output,
+                                                                      knn_indices,
+                                                                      unmerged_outputs,
+                                                                      unmerged_knn_indices,
+                                                                      offsets_d.data(),
+                                                                      parts_to_ranks_d.data(),
+                                                                      params.k,
+                                                                      params.n_outputs,
+                                                                      n_labels,
+                                                                      work.idxPartsToRanks.size(),
+                                                                      work.idxRanks.size());
   }
 
-  MLCommon::Selection::knn_regress<float, 32, true>(
-    handle, outputs, nullptr, y, n_labels, batch_size, params.k);
-}
-
-/*!
- Perform final classification, regression or class-proba operation for a given query batch
- @param[in] params Parameters for distrbuted KNN operation
- @param[in] work Current work for distributed KNN
- @param[in] handle RAFT handle
- @param[out] outputs KNN outputs output array
- @param[out] probas_with_offsets KNN class-probas output array (class-proba only)
- @param[in] labels KNN labels input array
- @param[in] batch_size Batch size
- */
-template <typename in_t,
-          typename ind_t,
-          typename dist_t,
-          typename out_t,
-          typename std::enable_if<std::is_integral<out_t>::value>::type* = nullptr>
-void perform_local_operation(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
-                             opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-                             raft::handle_t& handle,
-                             out_t* outputs,
-                             std::vector<float*>& probas_with_offsets,
-                             out_t* labels,
-                             size_t batch_size)
-{
-  size_t n_labels = batch_size * params.k;
-  std::vector<out_t*> y(params.n_outputs);
-  for (std::size_t o = 0; o < params.n_outputs; o++) {
-    y[o] = reinterpret_cast<out_t*>(labels) + (o * n_labels);
+  /*!
+   Perform the final regression operation for a query batch (floating-point out_t overload)
+   @param[in] params Parameters for distributed KNN operation
+   @param[in] work Current work for distributed KNN (not read by this overload)
+   @param[in] handle RAFT handle
+   @param[out] outputs KNN outputs output array
+   @param[in] probas_with_offsets Not used by this overload
+   @param[in] labels KNN labels input array
+   @param[in] batch_size Batch size
+   */
+  template <typename in_t,
+            typename ind_t,
+            typename dist_t,
+            typename out_t,
+            typename std::enable_if<std::is_floating_point<out_t>::value>::type* = nullptr>
+  void perform_local_operation(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
+                               opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+                               raft::handle_t& handle,
+                               out_t* outputs,
+                               std::vector<float*>& probas_with_offsets,
+                               out_t* labels,
+                               size_t batch_size)
+  {
+    size_t n_labels = batch_size * params.k;  // labels per target
+    std::vector<out_t*> y(params.n_outputs);  // one label pointer per target
+    for (std::size_t o = 0; o < params.n_outputs; o++) {
+      y[o] = reinterpret_cast<out_t*>(labels) + (o * n_labels);  // start of target o's labels
+    }
+
+    MLCommon::Selection::knn_regress<float, 32, true>(
+      handle, outputs, nullptr, y, n_labels, batch_size, params.k);
   }
 
-  switch (params.knn_op) {
-    case knn_operation::classification:
-      MLCommon::Selection::knn_classify<32, true>(handle,
-                                                  outputs,
-                                                  nullptr,
-                                                  y,
-                                                  n_labels,
-                                                  batch_size,
-                                                  params.k,
-                                                  *(params.uniq_labels),
-                                                  *(params.n_unique));
-      break;
-    case knn_operation::class_proba:
-      MLCommon::Selection::class_probs<32, true>(handle,
-                                                 probas_with_offsets,
-                                                 nullptr,
-                                                 y,
-                                                 n_labels,
-                                                 batch_size,
-                                                 params.k,
-                                                 *(params.uniq_labels),
-                                                 *(params.n_unique));
-      break;
-    default: CUML_LOG_DEBUG("FAILURE!");
+  /*!
+   Perform final classification or class-proba operation for a query batch (integral out_t)
+   @param[in] params Parameters for distributed KNN operation
+   @param[in] work Current work for distributed KNN (not read by this overload)
+   @param[in] handle RAFT handle
+   @param[out] outputs KNN outputs output array (classification only)
+   @param[out] probas_with_offsets KNN class-probas output array (class-proba only)
+   @param[in] labels KNN labels input array
+   @param[in] batch_size Batch size
+   */
+  template <typename in_t,
+            typename ind_t,
+            typename dist_t,
+            typename out_t,
+            typename std::enable_if<std::is_integral<out_t>::value>::type* = nullptr>
+  void perform_local_operation(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
+                               opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+                               raft::handle_t& handle,
+                               out_t* outputs,
+                               std::vector<float*>& probas_with_offsets,
+                               out_t* labels,
+                               size_t batch_size)
+  {
+    size_t n_labels = batch_size * params.k;  // labels per target
+    std::vector<out_t*> y(params.n_outputs);  // one label pointer per target
+    for (std::size_t o = 0; o < params.n_outputs; o++) {
+      y[o] = reinterpret_cast<out_t*>(labels) + (o * n_labels);  // start of target o's labels
+    }
+
+    switch (params.knn_op) {
+      case knn_operation::classification:  // writes predicted labels to outputs
+        MLCommon::Selection::knn_classify<32, true>(handle,
+                                                    outputs,
+                                                    nullptr,
+                                                    y,
+                                                    n_labels,
+                                                    batch_size,
+                                                    params.k,
+                                                    *(params.uniq_labels),
+                                                    *(params.n_unique));
+        break;
+      case knn_operation::class_proba:  // writes to probas_with_offsets; outputs is not used
+        MLCommon::Selection::class_probs<32, true>(handle,
+                                                   probas_with_offsets,
+                                                   nullptr,
+                                                   y,
+                                                   n_labels,
+                                                   batch_size,
+                                                   params.k,
+                                                   *(params.uniq_labels),
+                                                   *(params.n_unique));
+        break;
+      default: CUML_LOG_DEBUG("FAILURE!");  // any other knn_op is only logged, nothing is written
+    }
   }
-}
 
-};  // namespace knn_common
-};  // namespace opg
-};  // namespace KNN
-};  // namespace ML
+  };  // namespace knn_common
+  };  // namespace opg
+  };  // namespace KNN
+};    // namespace ML
diff --git a/cpp/src/knn/knn_sparse.cu b/cpp/src/knn/knn_sparse.cu
index 4106682cc9..00e675b226 100644
--- a/cpp/src/knn/knn_sparse.cu
+++ b/cpp/src/knn/knn_sparse.cu
@@ -26,50 +26,51 @@
 
 #include <cusparse_v2.h>
 
-namespace ML {
-namespace Sparse {
-
-void brute_force_knn(raft::handle_t& handle,
-                     const int* idx_indptr,
-                     const int* idx_indices,
-                     const float* idx_data,
-                     size_t idx_nnz,
-                     int n_idx_rows,
-                     int n_idx_cols,
-                     const int* query_indptr,
-                     const int* query_indices,
-                     const float* query_data,
-                     size_t query_nnz,
-                     int n_query_rows,
-                     int n_query_cols,
-                     int* output_indices,
-                     float* output_dists,
-                     int k,
-                     size_t batch_size_index,  // approx 1M
-                     size_t batch_size_query,
-                     raft::distance::DistanceType metric,
-                     float metricArg)
+namespace ML
 {
-  raft::sparse::selection::brute_force_knn(idx_indptr,
-                                           idx_indices,
-                                           idx_data,
-                                           idx_nnz,
-                                           n_idx_rows,
-                                           n_idx_cols,
-                                           query_indptr,
-                                           query_indices,
-                                           query_data,
-                                           query_nnz,
-                                           n_query_rows,
-                                           n_query_cols,
-                                           output_indices,
-                                           output_dists,
-                                           k,
-                                           handle,
-                                           batch_size_index,
-                                           batch_size_query,
-                                           metric,
-                                           metricArg);
-}
-};  // namespace Sparse
-};  // namespace ML
+  namespace Sparse {
+
+  void brute_force_knn(raft::handle_t& handle,  // thin wrapper over the RAFT sparse kNN
+                       const int* idx_indptr,
+                       const int* idx_indices,
+                       const float* idx_data,
+                       size_t idx_nnz,
+                       int n_idx_rows,
+                       int n_idx_cols,
+                       const int* query_indptr,
+                       const int* query_indices,
+                       const float* query_data,
+                       size_t query_nnz,
+                       int n_query_rows,
+                       int n_query_cols,
+                       int* output_indices,
+                       float* output_dists,
+                       int k,
+                       size_t batch_size_index,  // approx 1M
+                       size_t batch_size_query,
+                       raft::distance::DistanceType metric,
+                       float metricArg)
+  {
+    raft::sparse::selection::brute_force_knn(idx_indptr,  // every argument is forwarded as-is
+                                             idx_indices,
+                                             idx_data,
+                                             idx_nnz,
+                                             n_idx_rows,
+                                             n_idx_cols,
+                                             query_indptr,
+                                             query_indices,
+                                             query_data,
+                                             query_nnz,
+                                             n_query_rows,
+                                             n_query_cols,
+                                             output_indices,
+                                             output_dists,
+                                             k,
+                                             handle,  // passed after k, not first as in this wrapper
+                                             batch_size_index,
+                                             batch_size_query,
+                                             metric,
+                                             metricArg);
+  }
+  };  // namespace Sparse
+};    // namespace ML
diff --git a/cpp/src/metrics/trustworthiness.cu b/cpp/src/metrics/trustworthiness.cu
index 527a4c4148..6f392bd076 100644
--- a/cpp/src/metrics/trustworthiness.cu
+++ b/cpp/src/metrics/trustworthiness.cu
@@ -29,46 +29,47 @@
 
 #include <raft/core/handle.hpp>
 
-namespace ML {
-namespace Metrics {
-
-/**
- * @brief Compute the trustworthiness score
- *
- * @param h Raft handle
- * @param X Data in original dimension
- * @param X_embedded Data in target dimension (embedding)
- * @param n Number of samples
- * @param m Number of features in high/original dimension
- * @param d Number of features in low/embedded dimension
- * @param n_neighbors Number of neighbors considered by trustworthiness score
- * @param batchSize Batch size
- * @tparam distance_type: Distance type to consider
- * @return Trustworthiness score
- */
-template <typename math_t, raft::distance::DistanceType distance_type>
-double trustworthiness_score(const raft::handle_t& h,
-                             const math_t* X,
-                             math_t* X_embedded,
-                             int n,
-                             int m,
-                             int d,
-                             int n_neighbors,
-                             int batchSize)
+namespace ML
 {
-  return raft::stats::trustworthiness_score<math_t, distance_type>(
-    h, X, X_embedded, n, m, d, n_neighbors, batchSize);
-}
+  namespace Metrics {
+
+  /**
+   * @brief Compute the trustworthiness score
+   *
+   * @param h Raft handle
+   * @param X Data in original dimension
+   * @param X_embedded Data in target dimension (embedding)
+   * @param n Number of samples
+   * @param m Number of features in high/original dimension
+   * @param d Number of features in low/embedded dimension
+   * @param n_neighbors Number of neighbors considered by trustworthiness score
+   * @param batchSize Batch size
+   * @tparam distance_type: Distance type to consider
+   * @return Trustworthiness score
+   */
+  template <typename math_t, raft::distance::DistanceType distance_type>
+  double trustworthiness_score(const raft::handle_t& h,
+                               const math_t* X,
+                               math_t* X_embedded,
+                               int n,
+                               int m,
+                               int d,
+                               int n_neighbors,
+                               int batchSize)
+  {
+    return raft::stats::trustworthiness_score<math_t, distance_type>(
+      h, X, X_embedded, n, m, d, n_neighbors, batchSize);
+  }
 
-template double trustworthiness_score<float, raft::distance::DistanceType::L2SqrtUnexpanded>(
-  const raft::handle_t& h,
-  const float* X,
-  float* X_embedded,
-  int n,
-  int m,
-  int d,
-  int n_neighbors,
-  int batchSize);
+  template double trustworthiness_score<float, raft::distance::DistanceType::L2SqrtUnexpanded>(
+    const raft::handle_t& h,
+    const float* X,
+    float* X_embedded,
+    int n,
+    int m,
+    int d,
+    int n_neighbors,
+    int batchSize);
 
-};  // end namespace Metrics
-};  // end namespace ML
+  };  // end namespace Metrics
+};    // end namespace ML
diff --git a/cpp/src/randomforest/randomforest.cuh b/cpp/src/randomforest/randomforest.cuh
index 961f44360e..2e68a83b9f 100644
--- a/cpp/src/randomforest/randomforest.cuh
+++ b/cpp/src/randomforest/randomforest.cuh
@@ -45,274 +45,277 @@
 
 #include <map>
 
-namespace ML {
-
-template <class T, class L>
-class RandomForest {
- protected:
-  RF_params rf_params;  // structure containing RF hyperparameters
-  int rf_type;          // 0 for classification 1 for regression
-
-  void get_row_sample(int tree_id,
-                      int n_rows,
-                      rmm::device_uvector<int>* selected_rows,
-                      const cudaStream_t stream)
-  {
-    raft::common::nvtx::range fun_scope("bootstrapping row IDs @randomforest.cuh");
-
-    // Hash these together so they are uncorrelated
-    auto rs = DT::fnv1a32_basis;
-    rs      = DT::fnv1a32(rs, rf_params.seed);
-    rs      = DT::fnv1a32(rs, tree_id);
-    raft::random::Rng rng(rs, raft::random::GenPhilox);
-    if (rf_params.bootstrap) {
-      // Use bootstrapped sample set
-      rng.uniformInt<int>(selected_rows->data(), selected_rows->size(), 0, n_rows, stream);
-
-    } else {
-      // Use all the samples from the dataset
-      thrust::sequence(thrust::cuda::par.on(stream), selected_rows->begin(), selected_rows->end());
-    }
-  }
+namespace ML
+{
+  template <class T, class L>
+  class RandomForest {
+   protected:
+    RF_params rf_params;  // structure containing RF hyperparameters
+    int rf_type;          // 0 for classification 1 for regression
+
+    void get_row_sample(int tree_id,
+                        int n_rows,
+                        rmm::device_uvector<int>* selected_rows,
+                        const cudaStream_t stream)
+    {
+      raft::common::nvtx::range fun_scope("bootstrapping row IDs @randomforest.cuh");
+
+      // Hash these together so they are uncorrelated
+      auto rs = DT::fnv1a32_basis;
+      rs      = DT::fnv1a32(rs, rf_params.seed);
+      rs      = DT::fnv1a32(rs, tree_id);
+      raft::random::Rng rng(rs, raft::random::GenPhilox);
+      if (rf_params.bootstrap) {
+        // Use bootstrapped sample set
+        rng.uniformInt<int>(selected_rows->data(), selected_rows->size(), 0, n_rows, stream);
 
-  void error_checking(const T* input, L* predictions, int n_rows, int n_cols, bool predict) const
-  {
-    if (predict) {
-      ASSERT(predictions != nullptr, "Error! User has not allocated memory for predictions.");
+      } else {
+        // Use all the samples from the dataset
+        thrust::sequence(
+          thrust::cuda::par.on(stream), selected_rows->begin(), selected_rows->end());
+      }
     }
-    ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows);
-    ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols);
 
-    bool input_is_dev_ptr = DT::is_dev_ptr(input);
-    bool preds_is_dev_ptr = DT::is_dev_ptr(predictions);
+    void error_checking(const T* input, L* predictions, int n_rows, int n_cols, bool predict) const
+    {
+      if (predict) {
+        ASSERT(predictions != nullptr, "Error! User has not allocated memory for predictions.");
+      }
+      ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows);
+      ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols);
 
-    if (!input_is_dev_ptr || (input_is_dev_ptr != preds_is_dev_ptr)) {
-      ASSERT(false,
-             "RF Error: Expected both input and labels/predictions to be GPU "
-             "pointers");
-    }
-  }
-
- public:
-  /**
-   * @brief Construct RandomForest object.
-   * @param[in] cfg_rf_params: Random forest hyper-parameter struct.
-   * @param[in] cfg_rf_type: Task type: 0 for classification, 1 for regression
-   */
-  RandomForest(RF_params cfg_rf_params, int cfg_rf_type = RF_type::CLASSIFICATION)
-    : rf_params(cfg_rf_params), rf_type(cfg_rf_type){};
-
-  /**
-   * @brief Build (i.e., fit, train) random forest for input data.
-   * @param[in] user_handle: raft::handle_t
-   * @param[in] input: train data (n_rows samples, n_cols features) in column major format,
-   *   excluding labels. Device pointer.
-   * @param[in] n_rows: number of training data samples.
-   * @param[in] n_cols: number of features (i.e., columns) excluding target feature.
-   * @param[in] labels: 1D array of target predictions/labels. Device Pointer.
-            For classification task, only labels of type int are supported.
-              Assumption: labels were preprocessed to map to ascending numbers from 0;
-              needed for current gini impl in decision tree
-            For regression task, the labels (predictions) can be float or double data type.
-  * @param[in] n_unique_labels: (meaningful only for classification) #unique label values (known
-  during preprocessing)
-  * @param[in] forest: CPU point to RandomForestMetaData struct.
-  */
-  void fit(const raft::handle_t& user_handle,
-           const T* input,
-           int n_rows,
-           int n_cols,
-           L* labels,
-           int n_unique_labels,
-           RandomForestMetaData<T, L>*& forest)
-  {
-    raft::common::nvtx::range fun_scope("RandomForest::fit @randomforest.cuh");
-    this->error_checking(input, labels, n_rows, n_cols, false);
-    const raft::handle_t& handle = user_handle;
-    int n_sampled_rows           = 0;
-    if (this->rf_params.bootstrap) {
-      n_sampled_rows = std::round(this->rf_params.max_samples * n_rows);
-    } else {
-      if (this->rf_params.max_samples != 1.0) {
-        CUML_LOG_WARN(
-          "If bootstrap sampling is disabled, max_samples value is ignored and "
-          "whole dataset is used for building each tree");
-        this->rf_params.max_samples = 1.0;
+      bool input_is_dev_ptr = DT::is_dev_ptr(input);
+      bool preds_is_dev_ptr = DT::is_dev_ptr(predictions);
+
+      if (!input_is_dev_ptr || (input_is_dev_ptr != preds_is_dev_ptr)) {
+        ASSERT(false,
+               "RF Error: Expected both input and labels/predictions to be GPU "
+               "pointers");
       }
-      n_sampled_rows = n_rows;
-    }
-    int n_streams = this->rf_params.n_streams;
-    ASSERT(static_cast<std::size_t>(n_streams) <= handle.get_stream_pool_size(),
-           "rf_params.n_streams (=%d) should be <= raft::handle_t.n_streams (=%lu)",
-           n_streams,
-           handle.get_stream_pool_size());
-
-    // computing the quantiles: last two return values are shared pointers to device memory
-    // encapsulated by quantiles struct
-    auto [quantiles, quantiles_array, n_bins_array] =
-      DT::computeQuantiles(handle, input, this->rf_params.tree_params.max_n_bins, n_rows, n_cols);
-
-    // n_streams should not be less than n_trees
-    if (this->rf_params.n_trees < n_streams) n_streams = this->rf_params.n_trees;
-
-    // Select n_sampled_rows (with replacement) numbers from [0, n_rows) per tree.
-    // selected_rows: randomly generated IDs for bootstrapped samples (w/ replacement); a device
-    // ptr.
-    // Use a deque instead of vector because it can be used on objects with a deleted copy
-    // constructor
-    std::deque<rmm::device_uvector<int>> selected_rows;
-    for (int i = 0; i < n_streams; i++) {
-      selected_rows.emplace_back(n_sampled_rows, handle.get_stream_from_stream_pool(i));
     }
 
+   public:
+    /**
+     * @brief Construct RandomForest object.
+     * @param[in] cfg_rf_params: Random forest hyper-parameter struct.
+     * @param[in] cfg_rf_type: Task type: 0 for classification, 1 for regression
+     */
+    RandomForest(RF_params cfg_rf_params, int cfg_rf_type = RF_type::CLASSIFICATION)
+      : rf_params(cfg_rf_params), rf_type(cfg_rf_type){};
+
+    /**
+     * @brief Build (i.e., fit, train) random forest for input data.
+     * @param[in] user_handle: raft::handle_t
+     * @param[in] input: train data (n_rows samples, n_cols features) in column major format,
+     *   excluding labels. Device pointer.
+     * @param[in] n_rows: number of training data samples.
+     * @param[in] n_cols: number of features (i.e., columns) excluding target feature.
+     * @param[in] labels: 1D array of target predictions/labels. Device Pointer.
+              For classification task, only labels of type int are supported.
+                Assumption: labels were preprocessed to map to ascending numbers from 0;
+                needed for current gini impl in decision tree
+              For regression task, the labels (predictions) can be float or double data type.
+    * @param[in] n_unique_labels: (meaningful only for classification) #unique label values (known
+    during preprocessing)
+    * @param[in] forest: CPU pointer to RandomForestMetaData struct.
+    */
+    void fit(const raft::handle_t& user_handle,
+             const T* input,
+             int n_rows,
+             int n_cols,
+             L* labels,
+             int n_unique_labels,
+             RandomForestMetaData<T, L>*& forest)
+    {
+      raft::common::nvtx::range fun_scope("RandomForest::fit @randomforest.cuh");
+      this->error_checking(input, labels, n_rows, n_cols, false);
+      const raft::handle_t& handle = user_handle;
+      int n_sampled_rows           = 0;
+      if (this->rf_params.bootstrap) {
+        n_sampled_rows = std::round(this->rf_params.max_samples * n_rows);
+      } else {
+        if (this->rf_params.max_samples != 1.0) {
+          CUML_LOG_WARN(
+            "If bootstrap sampling is disabled, max_samples value is ignored and "
+            "whole dataset is used for building each tree");
+          this->rf_params.max_samples = 1.0;
+        }
+        n_sampled_rows = n_rows;
+      }
+      int n_streams = this->rf_params.n_streams;
+      ASSERT(static_cast<std::size_t>(n_streams) <= handle.get_stream_pool_size(),
+             "rf_params.n_streams (=%d) should be <= raft::handle_t.n_streams (=%lu)",
+             n_streams,
+             handle.get_stream_pool_size());
+
+      // computing the quantiles: last two return values are shared pointers to device memory
+      // encapsulated by quantiles struct
+      auto [quantiles, quantiles_array, n_bins_array] =
+        DT::computeQuantiles(handle, input, this->rf_params.tree_params.max_n_bins, n_rows, n_cols);
+
+      // n_streams should not be greater than n_trees
+      if (this->rf_params.n_trees < n_streams) n_streams = this->rf_params.n_trees;
+
+      // Select n_sampled_rows (with replacement) numbers from [0, n_rows) per tree.
+      // selected_rows: randomly generated IDs for bootstrapped samples (w/ replacement); a device
+      // ptr.
+      // Use a deque instead of vector because it can be used on objects with a deleted copy
+      // constructor
+      std::deque<rmm::device_uvector<int>> selected_rows;
+      for (int i = 0; i < n_streams; i++) {
+        selected_rows.emplace_back(n_sampled_rows, handle.get_stream_from_stream_pool(i));
+      }
+
 #pragma omp parallel for num_threads(n_streams)
-    for (int i = 0; i < this->rf_params.n_trees; i++) {
-      int stream_id = omp_get_thread_num();
-      auto s        = handle.get_stream_from_stream_pool(stream_id);
-
-      this->get_row_sample(i, n_rows, &selected_rows[stream_id], s);
-
-      /* Build individual tree in the forest.
-        - input is a pointer to orig data that have n_cols features and n_rows rows.
-        - n_sampled_rows: # rows sampled for tree's bootstrap sample.
-        - sorted_selected_rows: points to a list of row #s (w/ n_sampled_rows elements)
-          used to build the bootstrapped sample.
-          Expectation: Each tree node will contain (a) # n_sampled_rows and
-          (b) a pointer to a list of row numbers w.r.t original data.
-      */
-
-      forest->trees[i] = DT::DecisionTree::fit(handle,
-                                               s,
-                                               input,
-                                               n_cols,
-                                               n_rows,
-                                               labels,
-                                               &selected_rows[stream_id],
-                                               n_unique_labels,
-                                               this->rf_params.tree_params,
-                                               this->rf_params.seed,
-                                               quantiles,
-                                               i);
-    }
-    // Cleanup
-    handle.sync_stream_pool();
-    handle.sync_stream();
-  }
-
-  /**
-   * @brief Predict target feature for input data
-   * @param[in] user_handle: raft::handle_t.
-   * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU pointer.
-   * @param[in] n_rows: number of  data samples.
-   * @param[in] n_cols: number of features (excluding target feature).
-   * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated.
-   * @param[in] verbosity: verbosity level for logging messages during execution
-   */
-  void predict(const raft::handle_t& user_handle,
-               const T* input,
-               int n_rows,
-               int n_cols,
-               L* predictions,
-               const RandomForestMetaData<T, L>* forest,
-               int verbosity) const
-  {
-    ML::Logger::get().setLevel(verbosity);
-    this->error_checking(input, predictions, n_rows, n_cols, true);
-    std::vector<L> h_predictions(n_rows);
-    cudaStream_t stream = user_handle.get_stream();
-
-    std::vector<T> h_input(std::size_t(n_rows) * n_cols);
-    raft::update_host(h_input.data(), input, std::size_t(n_rows) * n_cols, stream);
-    user_handle.sync_stream(stream);
-
-    int row_size = n_cols;
-
-    ML::PatternSetter _("%v");
-    for (int row_id = 0; row_id < n_rows; row_id++) {
-      std::vector<T> row_prediction(forest->trees[0]->num_outputs);
       for (int i = 0; i < this->rf_params.n_trees; i++) {
-        DT::DecisionTree::predict(user_handle,
-                                  *forest->trees[i],
-                                  &h_input[row_id * row_size],
-                                  1,
-                                  n_cols,
-                                  row_prediction.data(),
-                                  forest->trees[i]->num_outputs,
-                                  verbosity);
+        int stream_id = omp_get_thread_num();
+        auto s        = handle.get_stream_from_stream_pool(stream_id);
+
+        this->get_row_sample(i, n_rows, &selected_rows[stream_id], s);
+
+        /* Build individual tree in the forest.
+          - input is a pointer to orig data that have n_cols features and n_rows rows.
+          - n_sampled_rows: # rows sampled for tree's bootstrap sample.
+          - selected_rows[stream_id]: points to a list of row #s (w/ n_sampled_rows elements)
+            used to build the bootstrapped sample.
+            Expectation: Each tree node will contain (a) # n_sampled_rows and
+            (b) a pointer to a list of row numbers w.r.t original data.
+        */
+
+        forest->trees[i] = DT::DecisionTree::fit(handle,
+                                                 s,
+                                                 input,
+                                                 n_cols,
+                                                 n_rows,
+                                                 labels,
+                                                 &selected_rows[stream_id],
+                                                 n_unique_labels,
+                                                 this->rf_params.tree_params,
+                                                 this->rf_params.seed,
+                                                 quantiles,
+                                                 i);
       }
-      for (int k = 0; k < forest->trees[0]->num_outputs; k++) {
-        row_prediction[k] /= this->rf_params.n_trees;
-      }
-      if (rf_type == RF_type::CLASSIFICATION) {  // classification task: use 'majority' prediction
-        L best_class = 0;
-        T best_prob  = 0.0;
+      // Cleanup
+      handle.sync_stream_pool();
+      handle.sync_stream();
+    }
+
+    /**
+     * @brief Predict target feature for input data
+     * @param[in] user_handle: raft::handle_t.
+     * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU
+     * pointer.
+     * @param[in] n_rows: number of  data samples.
+     * @param[in] n_cols: number of features (excluding target feature).
+     * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated.
+     * @param[in] verbosity: verbosity level for logging messages during execution
+     */
+    void predict(const raft::handle_t& user_handle,
+                 const T* input,
+                 int n_rows,
+                 int n_cols,
+                 L* predictions,
+                 const RandomForestMetaData<T, L>* forest,
+                 int verbosity) const
+    {
+      ML::Logger::get().setLevel(verbosity);
+      this->error_checking(input, predictions, n_rows, n_cols, true);
+      std::vector<L> h_predictions(n_rows);
+      cudaStream_t stream = user_handle.get_stream();
+
+      std::vector<T> h_input(std::size_t(n_rows) * n_cols);
+      raft::update_host(h_input.data(), input, std::size_t(n_rows) * n_cols, stream);
+      user_handle.sync_stream(stream);
+
+      int row_size = n_cols;
+
+      ML::PatternSetter _("%v");
+      for (int row_id = 0; row_id < n_rows; row_id++) {
+        std::vector<T> row_prediction(forest->trees[0]->num_outputs);
+        for (int i = 0; i < this->rf_params.n_trees; i++) {
+          DT::DecisionTree::predict(user_handle,
+                                    *forest->trees[i],
+                                    &h_input[row_id * row_size],
+                                    1,
+                                    n_cols,
+                                    row_prediction.data(),
+                                    forest->trees[i]->num_outputs,
+                                    verbosity);
+        }
         for (int k = 0; k < forest->trees[0]->num_outputs; k++) {
-          if (row_prediction[k] > best_prob) {
-            best_class = k;
-            best_prob  = row_prediction[k];
-          }
+          row_prediction[k] /= this->rf_params.n_trees;
         }
+        if (rf_type == RF_type::CLASSIFICATION) {  // classification task: use 'majority' prediction
+          L best_class = 0;
+          T best_prob  = 0.0;
+          for (int k = 0; k < forest->trees[0]->num_outputs; k++) {
+            if (row_prediction[k] > best_prob) {
+              best_class = k;
+              best_prob  = row_prediction[k];
+            }
+          }
 
-        h_predictions[row_id] = best_class;
-      } else {
-        h_predictions[row_id] = row_prediction[0];
+          h_predictions[row_id] = best_class;
+        } else {
+          h_predictions[row_id] = row_prediction[0];
+        }
       }
-    }
 
-    raft::update_device(predictions, h_predictions.data(), n_rows, stream);
-    user_handle.sync_stream(stream);
-  }
-
-  /**
-   * @brief Predict target feature for input data and score against ref_labels.
-   * @param[in] user_handle: raft::handle_t.
-   * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU pointer.
-   * @param[in] ref_labels: label values for cross validation (n_rows elements); GPU pointer.
-   * @param[in] n_rows: number of  data samples.
-   * @param[in] n_cols: number of features (excluding target feature).
-   * @param[in] predictions: n_rows predicted labels. GPU pointer, user allocated.
-   * @param[in] verbosity: verbosity level for logging messages during execution
-   * @param[in] rf_type: task type: 0 for classification, 1 for regression
-   */
-  static RF_metrics score(const raft::handle_t& user_handle,
-                          const L* ref_labels,
-                          int n_rows,
-                          const L* predictions,
-                          int verbosity,
-                          int rf_type = RF_type::CLASSIFICATION)
-  {
-    ML::Logger::get().setLevel(verbosity);
-    cudaStream_t stream = user_handle.get_stream();
-    RF_metrics stats;
-    if (rf_type == RF_type::CLASSIFICATION) {  // task classifiation: get classification metrics
-      float accuracy = raft::stats::accuracy(predictions, ref_labels, n_rows, stream);
-      stats          = set_rf_metrics_classification(accuracy);
-      if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) print(stats);
-
-      /* TODO: Potentially augment RF_metrics w/ more metrics (e.g., precision, F1, etc.).
-        For non binary classification problems (i.e., one target and  > 2 labels), need avg.
-        for each of these metrics */
-    } else {  // regression task: get regression metrics
-      double mean_abs_error, mean_squared_error, median_abs_error;
-      raft::stats::regression_metrics(predictions,
-                                      ref_labels,
-                                      n_rows,
-                                      stream,
-                                      mean_abs_error,
-                                      mean_squared_error,
-                                      median_abs_error);
-      stats = set_rf_metrics_regression(mean_abs_error, mean_squared_error, median_abs_error);
-      if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) print(stats);
+      raft::update_device(predictions, h_predictions.data(), n_rows, stream);
+      user_handle.sync_stream(stream);
     }
 
-    return stats;
-  }
-};
+    /**
+     * @brief Score previously computed predictions against ref_labels. This function does not
+     * run prediction itself.
+     * @param[in] user_handle: raft::handle_t.
+     * @param[in] ref_labels: label values for cross validation (n_rows elements); GPU pointer.
+     * @param[in] n_rows: number of  data samples.
+     * @param[in] predictions: n_rows predicted labels. GPU pointer, user allocated.
+     * @param[in] verbosity: verbosity level for logging messages during execution
+     * @param[in] rf_type: task type: 0 for classification, 1 for regression
+     * @return RF_metrics built from the accuracy (classification) or from the mean absolute,
+     *   mean squared and median absolute errors (regression).
+     */
+    static RF_metrics score(const raft::handle_t& user_handle,
+                            const L* ref_labels,
+                            int n_rows,
+                            const L* predictions,
+                            int verbosity,
+                            int rf_type = RF_type::CLASSIFICATION)
+    {
+      ML::Logger::get().setLevel(verbosity);
+      cudaStream_t stream = user_handle.get_stream();
+      RF_metrics stats;
+      if (rf_type == RF_type::CLASSIFICATION) {  // task classification: get classification metrics
+        float accuracy = raft::stats::accuracy(predictions, ref_labels, n_rows, stream);
+        stats          = set_rf_metrics_classification(accuracy);
+        if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) print(stats);
+
+        /* TODO: Potentially augment RF_metrics w/ more metrics (e.g., precision, F1, etc.).
+          For non binary classification problems (i.e., one target and  > 2 labels), need avg.
+          for each of these metrics */
+      } else {  // regression task: get regression metrics
+        double mean_abs_error, mean_squared_error, median_abs_error;
+        raft::stats::regression_metrics(predictions,
+                                        ref_labels,
+                                        n_rows,
+                                        stream,
+                                        mean_abs_error,
+                                        mean_squared_error,
+                                        median_abs_error);
+        stats = set_rf_metrics_regression(mean_abs_error, mean_squared_error, median_abs_error);
+        if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) print(stats);
+      }
+
+      return stats;
+    }
+  };
 
-// class specializations
-template class RandomForest<float, int>;
-template class RandomForest<float, float>;
-template class RandomForest<double, int>;
-template class RandomForest<double, double>;
+  // class specializations
+  template class RandomForest<float, int>;
+  template class RandomForest<float, float>;
+  template class RandomForest<double, int>;
+  template class RandomForest<double, double>;
 
 }  // End namespace ML
diff --git a/cpp/src/tsne/tsne.cu b/cpp/src/tsne/tsne.cu
index 3b95994ce9..5432910e1a 100644
--- a/cpp/src/tsne/tsne.cu
+++ b/cpp/src/tsne/tsne.cu
@@ -22,65 +22,65 @@
 #include <raft/distance/distance_type.hpp>
 >>>>>>> branch-22.10
 
-namespace ML {
-
-template <typename tsne_input, typename value_idx, typename value_t>
-value_t _fit(const raft::handle_t& handle,
-             tsne_input& input,
-             knn_graph<value_idx, value_t>& k_graph,
-             TSNEParams& params)
+namespace ML
 {
-  TSNE_runner<tsne_input, value_idx, value_t> runner(handle, input, k_graph, params);
+  template <typename tsne_input, typename value_idx, typename value_t>
+  value_t _fit(const raft::handle_t& handle,
+               tsne_input& input,
+               knn_graph<value_idx, value_t>& k_graph,
+               TSNEParams& params)
+  {
+    TSNE_runner<tsne_input, value_idx, value_t> runner(handle, input, k_graph, params);
 
-  return runner.run();  // returns the Kullback–Leibler divergence
-}
+    return runner.run();  // returns the Kullback–Leibler divergence
+  }
 
-void TSNE_fit(const raft::handle_t& handle,
-              float* X,
-              float* Y,
-              int n,
-              int p,
-              int64_t* knn_indices,
-              float* knn_dists,
-              TSNEParams& params,
-              float* kl_div)
-{
-  ASSERT(n > 0 && p > 0 && params.dim > 0 && params.n_neighbors > 0 && X != NULL && Y != NULL,
-         "Wrong input args");
+  void TSNE_fit(const raft::handle_t& handle,
+                float* X,
+                float* Y,
+                int n,
+                int p,
+                int64_t* knn_indices,
+                float* knn_dists,
+                TSNEParams& params,
+                float* kl_div)
+  {
+    ASSERT(n > 0 && p > 0 && params.dim > 0 && params.n_neighbors > 0 && X != NULL && Y != NULL,
+           "Wrong input args");
 
-  manifold_dense_inputs_t<float> input(X, Y, n, p);
-  knn_graph<int64_t, float> k_graph(n, params.n_neighbors, knn_indices, knn_dists);
+    manifold_dense_inputs_t<float> input(X, Y, n, p);
+    knn_graph<int64_t, float> k_graph(n, params.n_neighbors, knn_indices, knn_dists);
 
-  float kl_div_v = _fit<manifold_dense_inputs_t<float>, knn_indices_dense_t, float>(
-    handle, input, k_graph, params);
+    float kl_div_v = _fit<manifold_dense_inputs_t<float>, knn_indices_dense_t, float>(
+      handle, input, k_graph, params);
 
-  if (kl_div) { *kl_div = kl_div_v; }
-}
+    if (kl_div) { *kl_div = kl_div_v; }
+  }
 
-void TSNE_fit_sparse(const raft::handle_t& handle,
-                     int* indptr,
-                     int* indices,
-                     float* data,
-                     float* Y,
-                     int nnz,
-                     int n,
-                     int p,
-                     int* knn_indices,
-                     float* knn_dists,
-                     TSNEParams& params,
-                     float* kl_div)
-{
-  ASSERT(n > 0 && p > 0 && params.dim > 0 && params.n_neighbors > 0 && indptr != NULL &&
-           indices != NULL && data != NULL && Y != NULL,
-         "Wrong input args");
+  void TSNE_fit_sparse(const raft::handle_t& handle,
+                       int* indptr,
+                       int* indices,
+                       float* data,
+                       float* Y,
+                       int nnz,
+                       int n,
+                       int p,
+                       int* knn_indices,
+                       float* knn_dists,
+                       TSNEParams& params,
+                       float* kl_div)
+  {
+    ASSERT(n > 0 && p > 0 && params.dim > 0 && params.n_neighbors > 0 && indptr != NULL &&
+             indices != NULL && data != NULL && Y != NULL,
+           "Wrong input args");
 
-  manifold_sparse_inputs_t<int, float> input(indptr, indices, data, Y, nnz, n, p);
-  knn_graph<int, float> k_graph(n, params.n_neighbors, knn_indices, knn_dists);
+    manifold_sparse_inputs_t<int, float> input(indptr, indices, data, Y, nnz, n, p);
+    knn_graph<int, float> k_graph(n, params.n_neighbors, knn_indices, knn_dists);
 
-  float kl_div_v = _fit<manifold_sparse_inputs_t<int, float>, knn_indices_sparse_t, float>(
-    handle, input, k_graph, params);
+    float kl_div_v = _fit<manifold_sparse_inputs_t<int, float>, knn_indices_sparse_t, float>(
+      handle, input, k_graph, params);
 
-  if (kl_div) { *kl_div = kl_div_v; }
-}
+    if (kl_div) { *kl_div = kl_div_v; }
+  }
 
 }  // namespace ML
diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh
index 254115c3eb..b8bacf6945 100644
--- a/cpp/src/umap/knn_graph/algo.cuh
+++ b/cpp/src/umap/knn_graph/algo.cuh
@@ -39,7 +39,7 @@ namespace UMAPAlgo {
 namespace kNNGraph {
 namespace Algo {
 
-/**
+  /**
  * Initial implementation calls out to FAISS to do its work.
  */
 
@@ -177,4 +177,4 @@ inline void launcher(const raft::handle_t& handle,
 
 }  // namespace Algo
 }  // namespace kNNGraph
-};  // namespace UMAPAlgo
+}  // namespace UMAPAlgo
diff --git a/cpp/src_prims/matrix/grammatrix.cuh b/cpp/src_prims/matrix/grammatrix.cuh
index 4cb0b1ee97..537586fbfd 100644
--- a/cpp/src_prims/matrix/grammatrix.cuh
+++ b/cpp/src_prims/matrix/grammatrix.cuh
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include <raft/distance/specializations.cuh>
 #include <raft/distance/distance.cuh>
+#include <raft/distance/specializations.cuh>
 
 // #TODO: Replace with public header when ready
 #include <raft/linalg/detail/cublas_wrappers.hpp>

From e1fb7b7aafc6ae8d7628de1860a13b728eee900f Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 14 Oct 2022 19:58:13 -0400
Subject: [PATCH 24/38] Fixing svc test

---
 cpp/CMakeLists.txt                      |   6 +-
 cpp/bench/CMakeLists.txt                |  45 +--
 cpp/bench/prims/gram_matrix.cu          | 138 ---------
 cpp/bench/prims/main.cpp                |  19 --
 cpp/bench/sg/svc.cu                     |   4 +
 cpp/bench/sg/svr.cu                     |   5 +
 cpp/cmake/thirdparty/get_raft.cmake     |   4 +-
 cpp/include/cuml/matrix/kernelparams.h  |  21 +-
 cpp/src/svm/kernelcache.cuh             |  20 +-
 cpp/src/svm/linear.cu                   |  10 +-
 cpp/src/svm/results.cuh                 |   8 +-
 cpp/src/svm/smo_sets.cuh                |   2 +-
 cpp/src/svm/smoblocksolve.cuh           |   2 +-
 cpp/src/svm/smosolver.cuh               |  18 +-
 cpp/src/svm/svc.cu                      |   8 +-
 cpp/src/svm/svc_impl.cuh                |  20 +-
 cpp/src/svm/svm_api.cpp                 |  18 +-
 cpp/src/svm/svr.cu                      |   6 +-
 cpp/src/svm/svr_impl.cuh                |   2 +-
 cpp/src/svm/workingset.cuh              |   8 +-
 cpp/src/tsne/fft_tsne.cuh               |  49 ++-
 cpp/src_prims/label/merge_labels.cuh    |   4 +-
 cpp/src_prims/matrix/grammatrix.cuh     | 222 --------------
 cpp/src_prims/matrix/kernelfactory.cuh  |  50 ----
 cpp/src_prims/matrix/kernelmatrices.cuh | 381 ------------------------
 cpp/src_prims/selection/processing.cuh  | 226 --------------
 cpp/test/sg/svc_test.cu                 |  51 ++--
 27 files changed, 135 insertions(+), 1212 deletions(-)
 delete mode 100644 cpp/bench/prims/gram_matrix.cu
 delete mode 100644 cpp/bench/prims/main.cpp
 delete mode 100644 cpp/src_prims/matrix/grammatrix.cuh
 delete mode 100644 cpp/src_prims/matrix/kernelfactory.cuh
 delete mode 100644 cpp/src_prims/matrix/kernelmatrices.cuh
 delete mode 100644 cpp/src_prims/selection/processing.cuh

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e1b5e8761f..4cff817468 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -56,7 +56,6 @@ option(BUILD_CUML_MG_TESTS "Build cuML multigpu algorithm tests" OFF)
 option(BUILD_PRIMS_TESTS "Build ml-prim tests" ON)
 option(BUILD_CUML_EXAMPLES "Build C++ API usage examples" ON)
 option(BUILD_CUML_BENCH "Build cuML C++ benchmark tests" ON)
-option(BUILD_CUML_PRIMS_BENCH "Build ml-prims C++ benchmark tests" ON)
 option(BUILD_CUML_STD_COMMS "Build the standard NCCL+UCX Communicator" ON)
 option(BUILD_CUML_MPI_COMMS "Build the MPI+NCCL Communicator (used for testing)" OFF)
 option(CUDA_ENABLE_KERNEL_INFO "Enable kernel resource usage info" OFF)
@@ -80,7 +79,6 @@ message(VERBOSE "CUML_CPP: Building cuML multigpu algorithm tests: ${BUILD_CUML_
 message(VERBOSE "CUML_CPP: Building ml-prims tests: ${BUILD_PRIMS_TESTS}")
 message(VERBOSE "CUML_CPP: Building C++ API usage examples: ${BUILD_CUML_EXAMPLES}")
 message(VERBOSE "CUML_CPP: Building cuML C++ benchmark tests: ${BUILD_CUML_BENCH}")
-message(VERBOSE "CUML_CPP: Building ml-prims C++ benchmark tests: ${BUILD_CUML_PRIMS_BENCH}")
 message(VERBOSE "CUML_CPP: Building the standard NCCL+UCX Communicator: ${BUILD_CUML_STD_COMMS}")
 message(VERBOSE "CUML_CPP: Building the MPI+NCCL Communicator (used for testing): ${BUILD_CUML_MPI_COMMS}")
 message(VERBOSE "CUML_CPP: Enabling detection of conda environment for dependencies: ${DETECT_CONDA_ENV}")
@@ -244,7 +242,7 @@ if(BUILD_CUML_TESTS OR BUILD_PRIMS_TESTS)
   include(cmake/thirdparty/get_gtest.cmake)
 endif()
 
-if(BUILD_CUML_BENCH OR BUILD_CUML_PRIMS_BENCH)
+if(BUILD_CUML_BENCH)
   include(cmake/thirdparty/get_gbench.cmake)
 endif()
 
@@ -692,7 +690,7 @@ rapids_export(BUILD cuml
 ##############################################################################
 # - build benchmark executable -----------------------------------------------
 
-if(BUILD_CUML_BENCH OR BUILD_CUML_PRIMS_BENCH)
+if(BUILD_CUML_BENCH)
   add_subdirectory(bench)
 endif()
 
diff --git a/cpp/bench/CMakeLists.txt b/cpp/bench/CMakeLists.txt
index 45b6b71346..7fbb3da70b 100644
--- a/cpp/bench/CMakeLists.txt
+++ b/cpp/bench/CMakeLists.txt
@@ -66,47 +66,4 @@ if(BUILD_CUML_BENCH)
     DESTINATION bin/benchmarks/libcuml
     EXCLUDE_FROM_ALL
   )
-endif()
-
-##############################################################################
-# - build prims bench executable ----------------------------------------------
-
-if(BUILD_CUML_PRIMS_BENCH)
-  # (please keep the filenames in alphabetical order)
-  add_executable(${PRIMS_BENCH_TARGET}
-    prims/gram_matrix.cu
-    prims/main.cpp)
-
-  target_compile_options(${PRIMS_BENCH_TARGET}
-        PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${CUML_CXX_FLAGS}>"
-                "$<$<COMPILE_LANGUAGE:CUDA>:${CUML_CUDA_FLAGS}>"
-  )
-
-  target_link_libraries(${PRIMS_BENCH_TARGET}
-    PUBLIC
-      cuml::${CUML_CPP_TARGET}
-      benchmark::benchmark
-      ${TREELITE_LIBS}
-      raft::raft
-      raft::nn
-      raft::distance
-  )
-
-  target_include_directories(${PRIMS_BENCH_TARGET}
-    PRIVATE
-      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../src_prims>
-      $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
-  )
-
-  set_target_properties(
-    ${PRIMS_BENCH_TARGET}
-    PROPERTIES INSTALL_RPATH "\$ORIGIN/../../../lib"
-  )
-
-  install(
-    TARGETS ${PRIMS_BENCH_TARGET}
-    COMPONENT testing
-    DESTINATION bin/benchmarks/libcuml_prims
-    EXCLUDE_FROM_ALL
-  )
-endif()
+endif()
diff --git a/cpp/bench/prims/gram_matrix.cu b/cpp/bench/prims/gram_matrix.cu
deleted file mode 100644
index fec6cd1b8e..0000000000
--- a/cpp/bench/prims/gram_matrix.cu
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <common/ml_benchmark.hpp>
-#include <cuml/matrix/kernelparams.h>
-#include <matrix/grammatrix.cuh>
-#include <matrix/kernelfactory.cuh>
-#include <memory>
-// #TODO: Replace with public header when ready
-#include <raft/linalg/detail/cublas_wrappers.hpp>
-#include <raft/random/rng.cuh>
-#include <sstream>
-#include <string>
-#include <vector>
-
-namespace MLCommon {
-namespace Bench {
-namespace Matrix {
-
-using namespace MLCommon::Matrix;
-
-struct GramTestParams {
-  int m;  // m parameter of the GEMM
-  int k;  // k parameter of the GEMM
-  int n;  // n parameter of the GEMM
-  KernelParams kernel_params;
-  bool is_row_major;
-};  // struct GramTestParams
-
-template <typename T>
-struct GramMatrix : public Fixture {
-  GramMatrix(const std::string& name, const GramTestParams& p)
-    : Fixture(name), params(p), A(0, stream), B(0, stream), C(0, stream)
-  {
-    std::vector<std::string> kernel_names{"linear", "poly", "rbf", "tanh"};
-    std::ostringstream oss;
-    oss << name << "/" << kernel_names[p.kernel_params.kernel] << "/" << p.m << "x" << p.k << "x"
-        << p.n << "/" << (p.is_row_major ? "row_major" : "col_major");
-    this->SetName(oss.str().c_str());
-
-    RAFT_CUBLAS_TRY(cublasCreate(&cublas_handle));
-    kernel =
-      std::unique_ptr<GramMatrixBase<T>>(KernelFactory<T>::create(p.kernel_params, cublas_handle));
-  }
-
-  ~GramMatrix() { RAFT_CUBLAS_TRY_NO_THROW(cublasDestroy(cublas_handle)); }
-
- protected:
-  void allocateBuffers(const ::benchmark::State& state) override
-  {
-    A.resize(params.m * params.k, stream);
-    B.resize(params.k * params.n, stream);
-    C.resize(params.m * params.n, stream);
-    raft::random::Rng r(123456ULL);
-    r.uniform(A.data(), params.m * params.k, T(-1.0), T(1.0), stream);
-    r.uniform(B.data(), params.k * params.n, T(-1.0), T(1.0), stream);
-  }
-  void deallocateBuffers(const ::benchmark::State& state) override
-  {
-    A.release();
-    B.release();
-    C.release();
-  }
-  void runBenchmark(::benchmark::State& state) override
-  {
-    if (!this->kernel) { state.SkipWithError("Kernel matrix is not initialized"); }
-    loopOnState(state, [this]() {
-      (*this->kernel)(A.data(),
-                      this->params.m,
-                      this->params.k,
-                      B.data(),
-                      this->params.n,
-                      C.data(),
-                      this->params.is_row_major,
-                      this->stream);
-    });
-  }
-
- private:
-  cublasHandle_t cublas_handle;
-  std::unique_ptr<GramMatrixBase<T>> kernel;
-  GramTestParams params;
-
-  rmm::device_uvector<T> A;  // input matrix A, size [m * k]
-  rmm::device_uvector<T> B;  // input matrix B, size [n * k]
-  rmm::device_uvector<T> C;  // output matrix C, size [m*n]
-};
-
-static std::vector<GramTestParams> getInputs()
-{
-  std::vector<GramTestParams> param_vec;
-  std::vector<KernelParams> kernel_params{KernelParams{LINEAR, 3, 1, 0},
-                                          KernelParams{POLYNOMIAL, 2, 1.3, 1},
-                                          KernelParams{TANH, 2, 0.5, 2.4},
-                                          KernelParams{RBF, 2, 0.5, 0}};
-  struct TestSize {
-    int m;
-    int k;
-    int n;
-  };
-  std::vector<TestSize> data_size{{4096, 10, 1024},
-                                  {4096, 100, 1024},
-                                  {4096, 1000, 1024},
-                                  {4096, 10000, 1024},
-                                  {100000, 10, 1024},
-                                  {100000, 100, 1024},
-                                  {100000, 1000, 1024}};
-
-  param_vec.reserve(kernel_params.size() * data_size.size());
-  for (TestSize s : data_size) {
-    for (auto kernel : kernel_params) {
-      for (bool row_major : {false, true}) {
-        param_vec.push_back(GramTestParams{s.m, s.k, s.n, kernel, row_major});
-      }
-    }
-  }
-  return param_vec;
-}
-
-ML_BENCH_REGISTER(GramTestParams, GramMatrix<float>, "", getInputs());
-ML_BENCH_REGISTER(GramTestParams, GramMatrix<double>, "", getInputs());
-
-}  // namespace Matrix
-}  // namespace Bench
-}  // namespace MLCommon
diff --git a/cpp/bench/prims/main.cpp b/cpp/bench/prims/main.cpp
deleted file mode 100644
index 203a8b57d5..0000000000
--- a/cpp/bench/prims/main.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-/*
- * Copyright (c) 2019, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <benchmark/benchmark.h>  // NOLINT
-
-BENCHMARK_MAIN();
diff --git a/cpp/bench/sg/svc.cu b/cpp/bench/sg/svc.cu
index ea4037a822..8b8b391ece 100644
--- a/cpp/bench/sg/svc.cu
+++ b/cpp/bench/sg/svc.cu
@@ -14,6 +14,10 @@
  * limitations under the License.
  */
 
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
+#endif
+
 #include "benchmark.cuh"
 #include <cmath>
 #include <cuml/matrix/kernelparams.h>
diff --git a/cpp/bench/sg/svr.cu b/cpp/bench/sg/svr.cu
index 71b0b123b8..cf9996888a 100644
--- a/cpp/bench/sg/svr.cu
+++ b/cpp/bench/sg/svr.cu
@@ -14,6 +14,11 @@
  * limitations under the License.
  */
 
+
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
+#endif
+
 #include "benchmark.cuh"
 #include <cmath>
 #include <cuml/matrix/kernelparams.h>
diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index 59be6b9002..288463f6ec 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -81,8 +81,8 @@ endfunction()
 # To use a different RAFT locally, set the CMake variable
 # CPM_raft_SOURCE=/path/to/local/raft
 find_and_configure_raft(VERSION          ${CUML_MIN_VERSION_raft}
-                        FORK             rapidsai
-                        PINNED_TAG       branch-${CUML_BRANCH_VERSION_raft}
+                        FORK             cjnolet
+                        PINNED_TAG       imp-2212-move_grammatrix #branch-${CUML_BRANCH_VERSION_raft}
 
                         # When PINNED_TAG above doesn't match cuml,
                         # force local raft clone in build directory
diff --git a/cpp/include/cuml/matrix/kernelparams.h b/cpp/include/cuml/matrix/kernelparams.h
index c00ac53dd8..7f59547215 100644
--- a/cpp/include/cuml/matrix/kernelparams.h
+++ b/cpp/include/cuml/matrix/kernelparams.h
@@ -16,26 +16,13 @@
 
 #pragma once
 
+#include <raft/distance/distance_types.hpp>
+
 namespace MLCommon {
 namespace Matrix {
 
-enum KernelType { LINEAR, POLYNOMIAL, RBF, TANH };
-
-/**
- * Parameters for kernel matrices.
- * The following kernels are implemented:
- * - LINEAR \f[ K(x_1,x_2) = <x_1,x_2>, \f] where \f$< , >\f$ is the dot product
- * - POLYNOMIAL \f[ K(x_1, x_2) = (\gamma <x_1,x_2> + \mathrm{coef0})^\mathrm{degree} \f]
- * - RBF \f[ K(x_1, x_2) = \exp(- \gamma |x_1-x_2|^2) \f]
- * - TANH \f[ K(x_1, x_2) = \tanh(\gamma <x_1,x_2> + \mathrm{coef0}) \f]
- */
-struct KernelParams {
-  // Kernel function parameters
-  KernelType kernel;  //!< Type of the kernel function
-  int degree;         //!< Degree of polynomial kernel (ignored by others)
-  double gamma;       //!< multiplier in the
-  double coef0;       //!< additive constant in poly and tanh kernels
-};
+using raft::distance::kernels::KernelType;    // keeps MLCommon::Matrix::KernelType resolving
+using raft::distance::kernels::KernelParams;  // keeps MLCommon::Matrix::KernelParams resolving
 
 };  // end namespace Matrix
 };  // end namespace MLCommon
diff --git a/cpp/src/svm/kernelcache.cuh b/cpp/src/svm/kernelcache.cuh
index 68f08d27a2..5b75ac621f 100644
--- a/cpp/src/svm/kernelcache.cuh
+++ b/cpp/src/svm/kernelcache.cuh
@@ -18,15 +18,15 @@
 
 #include <cuml/svm/svm_parameter.h>
 
-#include <cache/cache.cuh>
-#include <cache/cache_util.cuh>
-#include <linalg/init.h>
-#include <matrix/grammatrix.cuh>
+#include <raft/distance/kernels.cuh>
+#include <raft/linalg/init.cuh>
+#include <raft/util/cache.cuh>
+#include <raft/util/cache_util.cuh>
 
-#include <raft/core/cudart_utils.hpp>
-#include <raft/cuda_utils.cuh>
 #include <raft/linalg/gemm.cuh>
 #include <raft/matrix/matrix.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
@@ -105,7 +105,7 @@ class KernelCache {
               int n_rows,
               int n_cols,
               int n_ws,
-              MLCommon::Matrix::GramMatrixBase<math_t>* kernel,
+              raft::distance::kernels::GramMatrixBase<math_t>* kernel,
               float cache_size = 200,
               SvmType svmType  = C_SVC)
     : cache(handle.get_stream(), n_rows, cache_size),
@@ -137,7 +137,7 @@ class KernelCache {
     x_ws.resize(x_ws_tile_size, handle.get_stream());
 
     // Default kernel_column_idx map for SVC
-    MLCommon::LinAlg::range(k_col_idx.data(), n_ws, stream);
+    raft::linalg::range(k_col_idx.data(), n_ws, stream);
 
     // Init cub buffers
     std::size_t bytes1{};
@@ -325,13 +325,13 @@ class KernelCache {
 
   cublasHandle_t cublas_handle;
 
-  MLCommon::Matrix::GramMatrixBase<math_t>* kernel;
+  raft::distance::kernels::GramMatrixBase<math_t>* kernel;
 
   const raft::handle_t handle;
 
   const int TPB = 256;  //!< threads per block for kernels launched
 
-  MLCommon::Cache::Cache<math_t> cache;
+  raft::util::cache::Cache<math_t> cache;
 
   cudaStream_t stream;
   SvmType svmType;
diff --git a/cpp/src/svm/linear.cu b/cpp/src/svm/linear.cu
index 9a8c751b5b..7c1049c334 100644
--- a/cpp/src/svm/linear.cu
+++ b/cpp/src/svm/linear.cu
@@ -17,15 +17,18 @@
 #include <random>
 #include <type_traits>
 
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
+#endif
+
 #include <common/nvtx.hpp>
 #include <cublas_v2.h>
 #include <cuml/svm/svm_model.h>
 #include <cuml/svm/svm_parameter.h>
-#include <label/classlabels.cuh>
-#include <matrix/kernelfactory.cuh>
 #include <omp.h>
 #include <raft/core/nvtx.hpp>
-#include <raft/cuda_utils.cuh>
+#include <raft/distance/kernels.cuh>
+#include <raft/label/classlabels.cuh>
 #include <raft/linalg/gemm.cuh>
 #include <raft/linalg/gemv.cuh>
 #include <raft/linalg/map.cuh>
@@ -33,6 +36,7 @@
 #include <raft/linalg/transpose.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/matrix.cuh>
+#include <raft/util/cuda_utils.cuh>
 #include <rmm/device_uvector.hpp>
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
diff --git a/cpp/src/svm/results.cuh b/cpp/src/svm/results.cuh
index 0d638d3f2a..8972fa4d3e 100644
--- a/cpp/src/svm/results.cuh
+++ b/cpp/src/svm/results.cuh
@@ -20,16 +20,16 @@
 #include <limits>
 #include <math.h>
 #include <memory>
-#include <raft/cuda_utils.cuh>
 
 #include "ws_util.cuh"
 #include <cub/device/device_select.cuh>
-#include <linalg/init.h>
-#include <raft/core/cudart_utils.hpp>
 #include <raft/linalg/add.cuh>
+#include <raft/linalg/init.cuh>
 #include <raft/linalg/map_then_reduce.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/matrix.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 
@@ -84,7 +84,7 @@ class Results {
       flag(n_train, stream)
   {
     InitCubBuffers();
-    MLCommon::LinAlg::range(f_idx.data(), n_train, stream);
+    raft::linalg::range(f_idx.data(), n_train, stream);
     RAFT_CUDA_TRY(cudaPeekAtLastError());
   }
 
diff --git a/cpp/src/svm/smo_sets.cuh b/cpp/src/svm/smo_sets.cuh
index 459273950d..f9fd7bf484 100644
--- a/cpp/src/svm/smo_sets.cuh
+++ b/cpp/src/svm/smo_sets.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <raft/cuda_utils.cuh>
+#include <raft/util/cuda_utils.cuh>
 
 namespace ML {
 namespace SVM {
diff --git a/cpp/src/svm/smoblocksolve.cuh b/cpp/src/svm/smoblocksolve.cuh
index b4b685b295..0ae00e8840 100644
--- a/cpp/src/svm/smoblocksolve.cuh
+++ b/cpp/src/svm/smoblocksolve.cuh
@@ -20,7 +20,7 @@
 
 #include "smo_sets.cuh"
 #include <cuml/svm/svm_parameter.h>
-#include <raft/cuda_utils.cuh>
+#include <raft/util/cuda_utils.cuh>
 #include <selection/kselection.cuh>
 #include <stdlib.h>
 
diff --git a/cpp/src/svm/smosolver.cuh b/cpp/src/svm/smosolver.cuh
index af8674ec08..a76f0423f8 100644
--- a/cpp/src/svm/smosolver.cuh
+++ b/cpp/src/svm/smosolver.cuh
@@ -17,10 +17,6 @@
 #pragma once
 
 #include <cuml/common/logger.hpp>
-#include <cuml/matrix/kernelparams.h>
-
-#include <matrix/grammatrix.cuh>
-#include <matrix/kernelfactory.cuh>
 
 // #TODO: Replace with public header when ready
 #include <raft/linalg/detail/cublas_wrappers.hpp>
@@ -29,8 +25,8 @@
 
 #include <iostream>
 #include <limits>
-#include <raft/core/cudart_utils.hpp>
-#include <raft/cuda_utils.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
 #include <string>
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
@@ -44,10 +40,8 @@
 #include "smoblocksolve.cuh"
 #include "workingset.cuh"
 #include "ws_util.cuh"
-#include <cuml/common/logger.hpp>
-#include <cuml/matrix/kernelparams.h>
-#include <matrix/grammatrix.cuh>
-#include <matrix/kernelfactory.cuh>
+#include <raft/distance/distance_types.hpp>
+#include <raft/distance/kernels.cuh>
 #include <raft/linalg/gemv.cuh>
 #include <raft/linalg/unary_op.cuh>
 
@@ -86,7 +80,7 @@ class SmoSolver {
  public:
   SmoSolver(const raft::handle_t& handle,
             SvmParameter param,
-            MLCommon::Matrix::GramMatrixBase<math_t>* kernel)
+            raft::distance::kernels::GramMatrixBase<math_t>* kernel)
     : handle(handle),
       C(param.C),
       tol(param.tol),
@@ -417,7 +411,7 @@ class SmoSolver {
   math_t tol;      //!< tolerance for stopping condition
   math_t epsilon;  //!< epsilon parameter for epsiolon-SVR
 
-  MLCommon::Matrix::GramMatrixBase<math_t>* kernel;
+  raft::distance::kernels::GramMatrixBase<math_t>* kernel;
   float cache_size;  //!< size of kernel cache in MiB
 
   SvmType svmType;  ///!< Type of the SVM problem to solve
diff --git a/cpp/src/svm/svc.cu b/cpp/src/svm/svc.cu
index 95072d7c61..37746ad599 100644
--- a/cpp/src/svm/svc.cu
+++ b/cpp/src/svm/svc.cu
@@ -16,12 +16,16 @@
 
 #include <iostream>
 
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
+#endif
+
 #include "kernelcache.cuh"
 #include "smosolver.cuh"
 #include "svc_impl.cuh"
 #include <cuml/svm/svc.hpp>
-#include <label/classlabels.cuh>
-#include <matrix/kernelfactory.cuh>
+#include <raft/distance/kernels.cuh>
+#include <raft/label/classlabels.cuh>
 #include <raft/linalg/unary_op.cuh>
 
 namespace ML {
diff --git a/cpp/src/svm/svc_impl.cuh b/cpp/src/svm/svc_impl.cuh
index 77751d9063..f1baf827c9 100644
--- a/cpp/src/svm/svc_impl.cuh
+++ b/cpp/src/svm/svc_impl.cuh
@@ -28,8 +28,7 @@
 #include <cublas_v2.h>
 #include <cuml/svm/svm_model.h>
 #include <cuml/svm/svm_parameter.h>
-#include <label/classlabels.cuh>
-#include <matrix/kernelfactory.cuh>
+#include <raft/distance/kernels.cuh>
 #include <raft/label/classlabels.cuh>
 // #TODO: Replace with public header when ready
 #include <raft/linalg/detail/cublas_wrappers.hpp>
@@ -51,7 +50,7 @@ void svcFit(const raft::handle_t& handle,
             int n_cols,
             math_t* labels,
             const SvmParameter& param,
-            MLCommon::Matrix::KernelParams& kernel_params,
+            raft::distance::kernels::KernelParams& kernel_params,
             SvmModel<math_t>& model,
             const math_t* sample_weight)
 {
@@ -79,8 +78,9 @@ void svcFit(const raft::handle_t& handle,
   MLCommon::Label::getOvrLabels(
     labels, n_rows, model.unique_labels, model.n_classes, y.data(), 1, stream);
 
-  MLCommon::Matrix::GramMatrixBase<math_t>* kernel =
-    MLCommon::Matrix::KernelFactory<math_t>::create(kernel_params, handle_impl.get_cublas_handle());
+  raft::distance::kernels::GramMatrixBase<math_t>* kernel =
+    raft::distance::kernels::KernelFactory<math_t>::create(kernel_params,
+                                                           handle_impl.get_cublas_handle());
   SmoSolver<math_t> smo(handle_impl, param, kernel);
   smo.Solve(input,
             n_rows,
@@ -102,7 +102,7 @@ void svcPredict(const raft::handle_t& handle,
                 math_t* input,
                 int n_rows,
                 int n_cols,
-                MLCommon::Matrix::KernelParams& kernel_params,
+                raft::distance::kernels::KernelParams& kernel_params,
                 const SvmModel<math_t>& model,
                 math_t* preds,
                 math_t buffer_size,
@@ -134,9 +134,9 @@ void svcPredict(const raft::handle_t& handle,
 
   cublasHandle_t cublas_handle = handle_impl.get_cublas_handle();
 
-  MLCommon::Matrix::GramMatrixBase<math_t>* kernel =
-    MLCommon::Matrix::KernelFactory<math_t>::create(kernel_params, cublas_handle);
-  if (kernel_params.kernel == MLCommon::Matrix::RBF) {
+  raft::distance::kernels::GramMatrixBase<math_t>* kernel =
+    raft::distance::kernels::KernelFactory<math_t>::create(kernel_params, cublas_handle);
+  if (kernel_params.kernel == raft::distance::kernels::RBF) {
     // Temporary buffers for the RBF kernel, see below
     x_rbf.resize(n_batch * n_cols, stream);
     idx.resize(n_batch, stream);
@@ -148,7 +148,7 @@ void svcPredict(const raft::handle_t& handle,
     if (i + n_batch >= n_rows) { n_batch = n_rows - i; }
     math_t* x_ptr = nullptr;
     int ld1       = 0;
-    if (kernel_params.kernel == MLCommon::Matrix::RBF) {
+    if (kernel_params.kernel == raft::distance::kernels::RBF) {
       // The RBF kernel does not support ld parameters (See issue #1172)
       // To come around this limitation, we copy the batch into a temporary
       // buffer.
diff --git a/cpp/src/svm/svm_api.cpp b/cpp/src/svm/svm_api.cpp
index 0025af6cb3..ba7f65e8ac 100644
--- a/cpp/src/svm/svm_api.cpp
+++ b/cpp/src/svm/svm_api.cpp
@@ -17,8 +17,8 @@
 #include <cuml/svm/svm_api.h>
 
 #include <common/cumlHandle.hpp>
-#include <cuml/matrix/kernelparams.h>
 #include <cuml/svm/svc.hpp>
+#include <raft/distance/distance_types.hpp>
 #include <tuple>
 
 extern "C" {
@@ -54,8 +54,8 @@ cumlError_t cumlSpSvcFit(cumlHandle_t handle,
   param.tol            = tol;
   param.verbosity      = verbosity;
 
-  MLCommon::Matrix::KernelParams kernel_param;
-  kernel_param.kernel = (MLCommon::Matrix::KernelType)kernel;
+  raft::distance::kernels::KernelParams kernel_param;
+  kernel_param.kernel = (raft::distance::kernels::KernelType)kernel;
   kernel_param.degree = degree;
   kernel_param.gamma  = gamma;
   kernel_param.coef0  = coef0;
@@ -128,8 +128,8 @@ cumlError_t cumlDpSvcFit(cumlHandle_t handle,
   param.tol            = tol;
   param.verbosity      = verbosity;
 
-  MLCommon::Matrix::KernelParams kernel_param;
-  kernel_param.kernel = (MLCommon::Matrix::KernelType)kernel;
+  raft::distance::kernels::KernelParams kernel_param;
+  kernel_param.kernel = (raft::distance::kernels::KernelType)kernel;
   kernel_param.degree = degree;
   kernel_param.gamma  = gamma;
   kernel_param.coef0  = coef0;
@@ -189,8 +189,8 @@ cumlError_t cumlSpSvcPredict(cumlHandle_t handle,
                              float buffer_size,
                              int predict_class)
 {
-  MLCommon::Matrix::KernelParams kernel_param;
-  kernel_param.kernel = (MLCommon::Matrix::KernelType)kernel;
+  raft::distance::kernels::KernelParams kernel_param;
+  kernel_param.kernel = (raft::distance::kernels::KernelType)kernel;
   kernel_param.degree = degree;
   kernel_param.gamma  = gamma;
   kernel_param.coef0  = coef0;
@@ -243,8 +243,8 @@ cumlError_t cumlDpSvcPredict(cumlHandle_t handle,
                              double buffer_size,
                              int predict_class)
 {
-  MLCommon::Matrix::KernelParams kernel_param;
-  kernel_param.kernel = (MLCommon::Matrix::KernelType)kernel;
+  raft::distance::kernels::KernelParams kernel_param;
+  kernel_param.kernel = (raft::distance::kernels::KernelType)kernel;
   kernel_param.degree = degree;
   kernel_param.gamma  = gamma;
   kernel_param.coef0  = coef0;
diff --git a/cpp/src/svm/svr.cu b/cpp/src/svm/svr.cu
index a8ce1cc477..870061cf71 100644
--- a/cpp/src/svm/svr.cu
+++ b/cpp/src/svm/svr.cu
@@ -16,12 +16,16 @@
 
 #include <iostream>
 
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
+#endif
+
 #include "kernelcache.cuh"
 #include "smosolver.cuh"
 #include "svr_impl.cuh"
 #include <cuml/svm/svc.hpp>
 #include <label/classlabels.cuh>
-#include <matrix/kernelfactory.cuh>
+#include <raft/distance/kernels.cuh>
 #include <raft/linalg/unary_op.cuh>
 
 namespace ML {
diff --git a/cpp/src/svm/svr_impl.cuh b/cpp/src/svm/svr_impl.cuh
index ed3f9d9815..b904b8c704 100644
--- a/cpp/src/svm/svr_impl.cuh
+++ b/cpp/src/svm/svr_impl.cuh
@@ -29,7 +29,7 @@
 #include <cuml/svm/svm_model.h>
 #include <cuml/svm/svm_parameter.h>
 #include <label/classlabels.cuh>
-#include <matrix/kernelfactory.cuh>
+#include <raft/distance/kernels.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/matrix.cuh>
 #include <thrust/copy.h>
diff --git a/cpp/src/svm/workingset.cuh b/cpp/src/svm/workingset.cuh
index f2a3569152..290cca7018 100644
--- a/cpp/src/svm/workingset.cuh
+++ b/cpp/src/svm/workingset.cuh
@@ -22,15 +22,15 @@
 #include <cuml/common/logger.hpp>
 #include <cuml/svm/svm_parameter.h>
 
-#include <linalg/init.h>
+#include <raft/linalg/init.cuh>
 
 #include "smo_sets.cuh"
 #include "ws_util.cuh"
 #include <cub/cub.cuh>
 #include <raft/core/handle.hpp>
-#include <raft/cuda_utils.cuh>
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/unary_op.cuh>
+#include <raft/util/cuda_utils.cuh>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <thrust/device_ptr.h>
@@ -462,8 +462,8 @@ class WorkingSet {
 
   void Initialize()
   {
-    MLCommon::LinAlg::range(f_idx.data(), n_train, stream);
-    MLCommon::LinAlg::range(idx.data(), n_ws, stream);
+    raft::linalg::range(f_idx.data(), n_train, stream);
+    raft::linalg::range(idx.data(), n_ws, stream);
   }
 
   /**
diff --git a/cpp/src/tsne/fft_tsne.cuh b/cpp/src/tsne/fft_tsne.cuh
index b47f060e58..2ca7506b01 100644
--- a/cpp/src/tsne/fft_tsne.cuh
+++ b/cpp/src/tsne/fft_tsne.cuh
@@ -28,8 +28,8 @@
 #include <cmath>
 #include <common/device_utils.cuh>
 #include <cufft_utils.h>
-#include <linalg/init.h>
 #include <raft/linalg/eltwise.cuh>
+#include <raft/linalg/init.cuh>
 #include <raft/stats/sum.cuh>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
@@ -208,60 +208,60 @@ value_t FFT_TSNE(value_t* VAL,
 #define DB(type, name, size) rmm::device_uvector<type> name(size, stream)
 
   DB(value_t, repulsive_forces_device, n * 2);
-  MLCommon::LinAlg::zero(repulsive_forces_device.data(), repulsive_forces_device.size(), stream);
+  raft::linalg::zero(repulsive_forces_device.data(), repulsive_forces_device.size(), stream);
   DB(value_t, attractive_forces_device, n * 2);
-  MLCommon::LinAlg::zero(attractive_forces_device.data(), attractive_forces_device.size(), stream);
+  raft::linalg::zero(attractive_forces_device.data(), attractive_forces_device.size(), stream);
   DB(value_t, gains_device, n * 2);
   auto gains_device_thrust = thrust::device_pointer_cast(gains_device.data());
   thrust::fill(thrust_policy, gains_device_thrust, gains_device_thrust + (n * 2), 1.0f);
   DB(value_t, old_forces_device, n * 2);
-  MLCommon::LinAlg::zero(old_forces_device.data(), old_forces_device.size(), stream);
+  raft::linalg::zero(old_forces_device.data(), old_forces_device.size(), stream);
   DB(value_t, normalization_vec_device, n);
-  MLCommon::LinAlg::zero(normalization_vec_device.data(), normalization_vec_device.size(), stream);
+  raft::linalg::zero(normalization_vec_device.data(), normalization_vec_device.size(), stream);
   DB(value_idx, point_box_idx_device, n);
   DB(value_t, x_in_box_device, n);
-  MLCommon::LinAlg::zero(x_in_box_device.data(), x_in_box_device.size(), stream);
+  raft::linalg::zero(x_in_box_device.data(), x_in_box_device.size(), stream);
   DB(value_t, y_in_box_device, n);
-  MLCommon::LinAlg::zero(y_in_box_device.data(), y_in_box_device.size(), stream);
+  raft::linalg::zero(y_in_box_device.data(), y_in_box_device.size(), stream);
   DB(value_t, y_tilde_values, total_interpolation_points * n_terms);
-  MLCommon::LinAlg::zero(y_tilde_values.data(), y_tilde_values.size(), stream);
+  raft::linalg::zero(y_tilde_values.data(), y_tilde_values.size(), stream);
   DB(value_t, x_interpolated_values_device, n * n_interpolation_points);
-  MLCommon::LinAlg::zero(
+  raft::linalg::zero(
     x_interpolated_values_device.data(), x_interpolated_values_device.size(), stream);
   DB(value_t, y_interpolated_values_device, n * n_interpolation_points);
-  MLCommon::LinAlg::zero(
+  raft::linalg::zero(
     y_interpolated_values_device.data(), y_interpolated_values_device.size(), stream);
   DB(value_t, potentialsQij_device, n * n_terms);
-  MLCommon::LinAlg::zero(potentialsQij_device.data(), potentialsQij_device.size(), stream);
+  raft::linalg::zero(potentialsQij_device.data(), potentialsQij_device.size(), stream);
   DB(value_t, w_coefficients_device, total_interpolation_points * n_terms);
-  MLCommon::LinAlg::zero(w_coefficients_device.data(), w_coefficients_device.size(), stream);
+  raft::linalg::zero(w_coefficients_device.data(), w_coefficients_device.size(), stream);
   DB(value_t,
      all_interpolated_values_device,
      n_terms * n_interpolation_points * n_interpolation_points * n);
-  MLCommon::LinAlg::zero(
+  raft::linalg::zero(
     all_interpolated_values_device.data(), all_interpolated_values_device.size(), stream);
   DB(value_t, output_values, n_terms * n_interpolation_points * n_interpolation_points * n);
-  MLCommon::LinAlg::zero(output_values.data(), output_values.size(), stream);
+  raft::linalg::zero(output_values.data(), output_values.size(), stream);
   DB(value_t,
      all_interpolated_indices,
      n_terms * n_interpolation_points * n_interpolation_points * n);
-  MLCommon::LinAlg::zero(all_interpolated_indices.data(), all_interpolated_indices.size(), stream);
+  raft::linalg::zero(all_interpolated_indices.data(), all_interpolated_indices.size(), stream);
   DB(value_t, output_indices, n_terms * n_interpolation_points * n_interpolation_points * n);
-  MLCommon::LinAlg::zero(output_indices.data(), output_indices.size(), stream);
+  raft::linalg::zero(output_indices.data(), output_indices.size(), stream);
   DB(value_t, chargesQij_device, n * n_terms);
-  MLCommon::LinAlg::zero(chargesQij_device.data(), chargesQij_device.size(), stream);
+  raft::linalg::zero(chargesQij_device.data(), chargesQij_device.size(), stream);
   DB(value_t, box_lower_bounds_device, 2 * n_total_boxes);
-  MLCommon::LinAlg::zero(box_lower_bounds_device.data(), box_lower_bounds_device.size(), stream);
+  raft::linalg::zero(box_lower_bounds_device.data(), box_lower_bounds_device.size(), stream);
   DB(value_t, kernel_tilde_device, n_fft_coeffs * n_fft_coeffs);
-  MLCommon::LinAlg::zero(kernel_tilde_device.data(), kernel_tilde_device.size(), stream);
+  raft::linalg::zero(kernel_tilde_device.data(), kernel_tilde_device.size(), stream);
   DB(cufftComplex,
      fft_kernel_tilde_device,
      2 * n_interpolation_points_1d * 2 * n_interpolation_points_1d);
   DB(value_t, fft_input, n_terms * n_fft_coeffs * n_fft_coeffs);
-  MLCommon::LinAlg::zero(fft_input.data(), fft_input.size(), stream);
+  raft::linalg::zero(fft_input.data(), fft_input.size(), stream);
   DB(cufftComplex, fft_w_coefficients, n_terms * n_fft_coeffs * (n_fft_coeffs / 2 + 1));
   DB(value_t, fft_output, n_terms * n_fft_coeffs * n_fft_coeffs);
-  MLCommon::LinAlg::zero(fft_output.data(), fft_output.size(), stream);
+  raft::linalg::zero(fft_output.data(), fft_output.size(), stream);
 
   value_t h = 1.0f / n_interpolation_points;
   value_t y_tilde_spacings[n_interpolation_points];
@@ -354,12 +354,11 @@ value_t FFT_TSNE(value_t* VAL,
       exaggeration  = params.late_exaggeration;
     }
 
-    MLCommon::LinAlg::zero(w_coefficients_device.data(), w_coefficients_device.size(), stream);
-    MLCommon::LinAlg::zero(potentialsQij_device.data(), potentialsQij_device.size(), stream);
+    raft::linalg::zero(w_coefficients_device.data(), w_coefficients_device.size(), stream);
+    raft::linalg::zero(potentialsQij_device.data(), potentialsQij_device.size(), stream);
     // IntegrationKernel zeroes this, but if this is removed
     // then FITSNE runs in an indefinite loop
-    MLCommon::LinAlg::zero(
-      attractive_forces_device.data(), attractive_forces_device.size(), stream);
+    raft::linalg::zero(attractive_forces_device.data(), attractive_forces_device.size(), stream);
 
     auto minmax_pair = min_max(Y, n * 2, stream);
     auto min_coord   = minmax_pair.first;
diff --git a/cpp/src_prims/label/merge_labels.cuh b/cpp/src_prims/label/merge_labels.cuh
index d9efc3729e..178b6c692c 100644
--- a/cpp/src_prims/label/merge_labels.cuh
+++ b/cpp/src_prims/label/merge_labels.cuh
@@ -19,7 +19,7 @@
 #include <limits>
 #include <math.h>
 
-#include <linalg/init.h>
+#include <raft/linalg/init.cuh>
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/interruptible.hpp>
 #include <raft/cuda_utils.cuh>
@@ -125,7 +125,7 @@ void merge_labels(Index_* labels_a,
 
   // Initialize R. R defines the relabeling rules; after merging the input
   // arrays, label l will be reassigned as R[l-1]+1.
-  MLCommon::LinAlg::range(R, N, stream);
+  raft::linalg::range(R, N, stream);
 
   // We define the label equivalence graph: G = (V, E), where:
   //  - V is the set of unique values from labels_a and labels_b
diff --git a/cpp/src_prims/matrix/grammatrix.cuh b/cpp/src_prims/matrix/grammatrix.cuh
deleted file mode 100644
index 537586fbfd..0000000000
--- a/cpp/src_prims/matrix/grammatrix.cuh
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/distance/distance.cuh>
-#include <raft/distance/specializations.cuh>
-
-// #TODO: Replace with public header when ready
-#include <raft/linalg/detail/cublas_wrappers.hpp>
-#include <raft/linalg/gemm.cuh>
-
-namespace MLCommon {
-namespace Matrix {
-
-/**
- * Base class for general Gram matrices
- * A Gram matrix is the Hermitian matrix of inner probucts G_ik = <x_i, x_k>
- * Here, the  inner product is evaluated for all elements from vectors sets X1,
- * and X2.
- *
- * To be more precise, on exit the output buffer will store:
- * - if is_row_major == true: out[j+k*n1] = <x1_j, x2_k>,
- * - if is_row_major == false: out[j*n2 + k] = <x1_j, x2_k>,
- * where x1_j is the j-th vector from the x1 set and x2_k is the k-th vector
- * from the x2 set.
- */
-template <typename math_t>
-class GramMatrixBase {
-  cublasHandle_t cublas_handle;
-
- public:
-  GramMatrixBase(cublasHandle_t cublas_handle) : cublas_handle(cublas_handle){};
-
-  virtual ~GramMatrixBase(){};
-
-  /** Convenience function to evaluate the Gram matrix for two vector sets.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of columns (features) in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1
-   * @param ld2 leading dimension of x2
-   * @param ld_out leading dimension of out
-   */
-  virtual void operator()(const math_t* x1,
-                          int n1,
-                          int n_cols,
-                          const math_t* x2,
-                          int n2,
-                          math_t* out,
-                          bool is_row_major,
-                          cudaStream_t stream,
-                          int ld1    = 0,
-                          int ld2    = 0,
-                          int ld_out = 0)
-  {
-    if (ld1 <= 0) { ld1 = is_row_major ? n_cols : n1; }
-    if (ld2 <= 0) { ld2 = is_row_major ? n_cols : n2; }
-    if (ld_out <= 0) { ld_out = is_row_major ? n2 : n1; }
-    evaluate(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
-  }
-
-  /** Evaluate the Gram matrix for two vector sets using simple dot product.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of columns (features) in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1 (usually it is n1)
-   * @param ld2 leading dimension of x2 (usually it is n2)
-   * @param ld_out leading dimension of out (usually it is n1)
-   */
-  virtual void evaluate(const math_t* x1,
-                        int n1,
-                        int n_cols,
-                        const math_t* x2,
-                        int n2,
-                        math_t* out,
-                        bool is_row_major,
-                        cudaStream_t stream,
-                        int ld1,
-                        int ld2,
-                        int ld_out)
-  {
-    linear(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
-  }
-
-  // private:
-  // The following methods should be private, they are kept public to avoid:
-  // "error: The enclosing parent function ("distance") for an extended
-  // __device__ lambda cannot have private or protected access within its class"
-
-  /** Calculates the Gram matrix using simple dot product between vector sets.
-   *
-   * out = x1 * x2
-   *
-   * Can be used as a building block for more complex kernel functions.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of colums (features) in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1
-   * @param ld2 leading dimension of x2
-   * @param ld_out leading dimension of out
-   */
-  void linear(const math_t* x1,
-              int n1,
-              int n_cols,
-              const math_t* x2,
-              int n2,
-              math_t* out,
-              bool is_row_major,
-              cudaStream_t stream,
-              int ld1,
-              int ld2,
-              int ld_out)
-  {
-    math_t alpha = 1.0;
-    math_t beta  = 0.0;
-    if (is_row_major) {
-      // #TODO: Call from public API when ready
-      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
-                                                       CUBLAS_OP_T,
-                                                       CUBLAS_OP_N,
-                                                       n2,
-                                                       n1,
-                                                       n_cols,
-                                                       &alpha,
-                                                       x2,
-                                                       ld2,
-                                                       x1,
-                                                       ld1,
-                                                       &beta,
-                                                       out,
-                                                       ld_out,
-                                                       stream));
-    } else {
-      // #TODO: Call from public API when ready
-      RAFT_CUBLAS_TRY(raft::linalg::detail::cublasgemm(cublas_handle,
-                                                       CUBLAS_OP_N,
-                                                       CUBLAS_OP_T,
-                                                       n1,
-                                                       n2,
-                                                       n_cols,
-                                                       &alpha,
-                                                       x1,
-                                                       ld1,
-                                                       x2,
-                                                       ld2,
-                                                       &beta,
-                                                       out,
-                                                       ld_out,
-                                                       stream));
-    }
-  }
-
-  /** Calculates the Gram matrix using Euclidean distance.
-   *
-   * Can be used as a building block for more complex kernel functions.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of columns (features) in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1
-   * @param ld2 leading dimension of x2
-   * @param ld_out leading dimension of out
-   */
-  virtual void distance(const math_t* x1,
-                        int n1,
-                        int n_cols,
-                        const math_t* x2,
-                        int n2,
-                        math_t* out,
-                        bool is_row_major,
-                        cudaStream_t stream,
-                        int ld1,
-                        int ld2,
-                        int ld_out)
-  {
-    raft::distance::distance<raft::distance::DistanceType::L2Unexpanded, math_t, math_t, math_t>(
-      x1, x2, out, n1, n2, n_cols, stream, is_row_major);
-  }
-};
-};  // end namespace Matrix
-};  // end namespace MLCommon
diff --git a/cpp/src_prims/matrix/kernelfactory.cuh b/cpp/src_prims/matrix/kernelfactory.cuh
deleted file mode 100644
index beaeae7a84..0000000000
--- a/cpp/src_prims/matrix/kernelfactory.cuh
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "grammatrix.cuh"
-#include "kernelmatrices.cuh"
-#include <cuml/matrix/kernelparams.h>
-#include <raft/core/cudart_utils.hpp>
-
-namespace MLCommon {
-namespace Matrix {
-
-template <typename math_t>
-class KernelFactory {
- public:
-  static GramMatrixBase<math_t>* create(KernelParams params, cublasHandle_t cublas_handle)
-  {
-    GramMatrixBase<math_t>* res;
-    // KernelParams is not templated, we convert the parameters to math_t here:
-    math_t coef0 = params.coef0;
-    math_t gamma = params.gamma;
-    switch (params.kernel) {
-      case LINEAR: res = new GramMatrixBase<math_t>(cublas_handle); break;
-      case POLYNOMIAL:
-        res = new PolynomialKernel<math_t, int>(params.degree, gamma, coef0, cublas_handle);
-        break;
-      case TANH: res = new TanhKernel<math_t>(gamma, coef0, cublas_handle); break;
-      case RBF: res = new RBFKernel<math_t>(gamma); break;
-      default: throw raft::exception("Kernel not implemented");
-    }
-    return res;
-  }
-};
-
-};  // end namespace Matrix
-};  // end namespace MLCommon
diff --git a/cpp/src_prims/matrix/kernelmatrices.cuh b/cpp/src_prims/matrix/kernelmatrices.cuh
deleted file mode 100644
index edaf9c9ad7..0000000000
--- a/cpp/src_prims/matrix/kernelmatrices.cuh
+++ /dev/null
@@ -1,381 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "grammatrix.cuh"
-#include <raft/cuda_utils.cuh>
-
-#include <raft/distance/distance.cuh>
-#include <raft/distance/specializations.cuh>
-#include <raft/linalg/gemm.cuh>
-
-namespace MLCommon {
-namespace Matrix {
-
-using namespace MLCommon;
-
-/** Epiloge function for polynomial kernel without padding.
- * Calculates output = (gain*in + offset)^exponent
- * @param inout device vector in column major format, size [len]
- * @param len array length
- * @param exponent
- * @param gain
- * @param offset
- */
-template <typename math_t, typename exp_t>
-__global__ void polynomial_kernel_nopad(
-  math_t* inout, size_t len, exp_t exponent, math_t gain, math_t offset)
-{
-  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
-       tid += blockDim.x * gridDim.x) {
-    inout[tid] = pow(gain * inout[tid] + offset, exponent);
-  }
-}
-
-/** Epiloge function for polynomial kernel with padding.
- * Calculates output = (gain*input + offset)^exponent
- * @param inout device vector in column major format, size [ld * cols]
- * @param ld leading dimension of the inout buffer
- * @param rows number of rows (rows <= ld)
- * @param cols number of colums
- * @param exponent
- * @param gain
- * @param offset
- */
-template <typename math_t, typename exp_t>
-__global__ void polynomial_kernel(
-  math_t* inout, int ld, int rows, int cols, exp_t exponent, math_t gain, math_t offset)
-{
-  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
-       tidy += blockDim.y * gridDim.y)
-    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
-         tidx += blockDim.x * gridDim.x) {
-      inout[tidx + tidy * ld] = pow(gain * inout[tidx + tidy * ld] + offset, exponent);
-    }
-}
-
-/** Epiloge function for tanh kernel without padding.
- * Calculates output = tanh(gain*input + offset)
- * @param inout device vector, size [len]
- * @param len length of the input vector
- * @param gain
- * @param offset
- */
-template <typename math_t>
-__global__ void tanh_kernel_nopad(math_t* inout, size_t len, math_t gain, math_t offset)
-{
-  for (size_t tid = threadIdx.x + blockIdx.x * blockDim.x; tid < len;
-       tid += blockDim.x * gridDim.x) {
-    inout[tid] = tanh(gain * inout[tid] + offset);
-  }
-}
-
-/** Epiloge function for tanh kernel without padding.
- * Calculates output = tanh(gain*input + offset)
- * @param inout device vector in column major format, size [ld * cols]
- * @param ld leading dimension of the inout buffer
- * @param rows number of rows (rows <= ld)
- * @param cols number of colums
- * @param gain
- * @param offset
- */
-template <typename math_t>
-__global__ void tanh_kernel(math_t* inout, int ld, int rows, int cols, math_t gain, math_t offset)
-{
-  for (size_t tidy = threadIdx.y + blockIdx.y * blockDim.y; tidy < cols;
-       tidy += blockDim.y * gridDim.y)
-    for (size_t tidx = threadIdx.x + blockIdx.x * blockDim.x; tidx < rows;
-         tidx += blockDim.x * gridDim.x) {
-      inout[tidx + tidy * ld] = tanh(gain * inout[tidx + tidy * ld] + offset);
-    }
-}
-
-/**
- * Create a kernel matrix using polynomial kernel function.
- */
-template <typename math_t, typename exp_t>
-class PolynomialKernel : public GramMatrixBase<math_t> {
-  exp_t exponent;
-  math_t gain;
-  math_t offset;
-
-  void applyKernel(
-    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
-  {
-    const int n_minor = is_row_major ? cols : rows;
-    if (ld == n_minor) {
-      polynomial_kernel_nopad<<<raft::ceildiv<size_t>((size_t)rows * cols, 128), 128, 0, stream>>>(
-        inout, rows * cols, exponent, gain, offset);
-    } else {
-      int n1 = is_row_major ? cols : rows;
-      int n2 = is_row_major ? rows : cols;
-      polynomial_kernel<<<dim3(raft::ceildiv(n1, 32), raft::ceildiv(n2, 4), 1),
-                          dim3(32, 4, 1),
-                          0,
-                          stream>>>(inout, ld, n1, n2, exponent, gain, offset);
-    }
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
-
- public:
-  /**
-   * Constructs a polynomial kernel object.
-   * It evaluates the kernel matrix using the following formula:
-   * K_ij = (gain*<x1_i, x2_k> + offset)^exponent
-   *
-   * @tparam math_t floating point type
-   * @tparam exp_t type of exponent
-   * @param exponent
-   * @param gain
-   * @param offset
-   * @param cublas_handle
-   */
-  PolynomialKernel(exp_t exponent, math_t gain, math_t offset, cublasHandle_t cublas_handle)
-    : GramMatrixBase<math_t>(cublas_handle), exponent(exponent), gain(gain), offset(offset)
-  {
-  }
-
-  /** Evaluate kernel matrix using polynomial kernel.
-   *
-   * output[i,k] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of features in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1
-   * @param ld2 leading dimension of x2
-   * @param ld_out leading dimension of out
-   */
-  void evaluate(const math_t* x1,
-                int n1,
-                int n_cols,
-                const math_t* x2,
-                int n2,
-                math_t* out,
-                bool is_row_major,
-                cudaStream_t stream,
-                int ld1,
-                int ld2,
-                int ld_out)
-  {
-    GramMatrixBase<math_t>::linear(
-      x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
-    applyKernel(out, ld_out, n1, n2, is_row_major, stream);
-  }
-};
-
-/**
- * Create a kernel matrix using tanh kernel function.
- */
-template <typename math_t>
-class TanhKernel : public GramMatrixBase<math_t> {
-  math_t gain, offset;
-
-  void applyKernel(
-    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
-  {
-    const int n_minor = is_row_major ? cols : rows;
-    if (ld == n_minor) {
-      tanh_kernel_nopad<<<raft::ceildiv<size_t>((size_t)rows * cols, 128), 128, 0, stream>>>(
-        inout, rows * cols, gain, offset);
-    } else {
-      int n1 = is_row_major ? cols : rows;
-      int n2 = is_row_major ? rows : cols;
-      tanh_kernel<<<dim3(raft::ceildiv(n1, 32), raft::ceildiv(n2, 4), 1),
-                    dim3(32, 4, 1),
-                    0,
-                    stream>>>(inout, ld, n1, n2, gain, offset);
-    }
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
-
- public:
-  /**
-   * Constructs a tanh kernel object.
-   * It evaluates the kernel matrix using the following formula:
-   * K_ij = tanh(gain*<x1_i, x2_k> + offset)
-   *
-   * @tparam math_t floating point type
-   * @param gain
-   * @param offset
-   * @param cublas_handle
-   */
-  TanhKernel(math_t gain, math_t offset, cublasHandle_t cublas_handle)
-    : GramMatrixBase<math_t>(cublas_handle), gain(gain), offset(offset)
-  {
-  }
-
-  /** Evaluate kernel matrix using tanh kernel.
-   *
-   * output_[i + k*n1] = (gain*<x1_i, x2_k> + offset)^exponent,
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and < , > denotes dot product.
-   *
-   * @param [in] x1 device array of vectors,
-   *  size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of features in x1 and x2
-   * @param [in] x2 device array of vectors,
-   *   size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1 (usually it is n1)
-   * @param ld2 leading dimension of x2 (usually it is n2)
-   * @param ld_out leading dimension of out (usually it is n1)
-   */
-  void evaluate(const math_t* x1,
-                int n1,
-                int n_cols,
-                const math_t* x2,
-                int n2,
-                math_t* out,
-                bool is_row_major,
-                cudaStream_t stream,
-                int ld1,
-                int ld2,
-                int ld_out)
-  {
-    GramMatrixBase<math_t>::linear(
-      x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
-    applyKernel(out, ld_out, n1, n2, is_row_major, stream);
-  }
-};
-
-/**
- * Create a kernel matrix using RBF kernel function.
- */
-template <typename math_t>
-class RBFKernel : public GramMatrixBase<math_t> {
-  math_t gain;
-
-  void applyKernel(
-    math_t* inout, int ld, int rows, int cols, bool is_row_major, cudaStream_t stream)
-  {
-    const int n_minor = is_row_major ? cols : rows;
-    if (ld == n_minor) {
-      rbf_kernel_nopad<<<raft::ceildiv<size_t>((size_t)rows * cols, 128), 128, 0, stream>>>(
-        inout, rows * cols, gain);
-    } else {
-      int n1 = is_row_major ? cols : rows;
-      int n2 = is_row_major ? rows : cols;
-      rbf_kernel<<<dim3(raft::ceildiv(n1, 32), raft::ceildiv(n2, 4), 1),
-                   dim3(32, 4, 1),
-                   0,
-                   stream>>>(inout, ld, n1, n2, gain);
-    }
-  }
-
- public:
-  /**
-   * Constructs a RBF kernel object.
-   * It evaluates the kernel matrix using the following formula:
-   * K_ij = exp(-gain*|x1_i- x2_k|^2)
-   *
-   * @tparam math_t floating point type
-   * @param gain
-   */
-  RBFKernel(math_t gain) : GramMatrixBase<math_t>(NULL), gain(gain) {}
-
-  /** Evaluate kernel matrix using RBF kernel.
-   *
-   * output_[i + k*n1] = exp(-gain*|x1_i - x2_k|^2),
-   * where x1_i is the i-th vector from the x1 set, and x2_k is k-th vector
-   * in the x2 set, and | | euclidean distance.
-   *
-   * @param [in] x1 device array of vectors, size [n1*n_cols]
-   * @param [in] n1 number vectors in x1
-   * @param [in] n_cols number of features in x1 and x2
-   * @param [in] x2 device array of vectors, size [n2*n_cols]
-   * @param [in] n2 number vectors in x2
-   * @param [out] out device buffer to store the Gram matrix, size [n1*n2]
-   * @param [in] is_row_major whether the input and output matrices are in row
-   *        major format
-   * @param [in] stream cuda stream
-   * @param ld1 leading dimension of x1, currently only ld1 == n1 is supported
-   * @param ld2 leading dimension of x2, currently only ld2 == n2 is supported
-   * @param ld_out leading dimension of out, only ld_out == n1 is supported
-   */
-  void evaluate(const math_t* x1,
-                int n1,
-                int n_cols,
-                const math_t* x2,
-                int n2,
-                math_t* out,
-                bool is_row_major,
-                cudaStream_t stream,
-                int ld1,
-                int ld2,
-                int ld_out)
-  {
-    int minor1    = is_row_major ? n_cols : n1;
-    int minor2    = is_row_major ? n_cols : n2;
-    int minor_out = is_row_major ? n2 : n1;
-    ASSERT(ld1 == minor1, "RBF Kernel distance does not support ld1 parameter");
-    ASSERT(ld2 == minor2, "RBF Kernel distance does not support ld2 parameter");
-    ASSERT(ld_out == minor_out, "RBF Kernel distance does not support ld_out parameter");
-    distance(x1, n1, n_cols, x2, n2, out, is_row_major, stream, ld1, ld2, ld_out);
-  }
-
-  /** Customize distance function withe RBF epilogue */
-  void distance(const math_t* x1,
-                int n1,
-                int n_cols,
-                const math_t* x2,
-                int n2,
-                math_t* out,
-                bool is_row_major,
-                cudaStream_t stream,
-                int ld1,
-                int ld2,
-                int ld_out)
-  {
-    math_t gain   = this->gain;
-    using index_t = int64_t;
-
-    auto fin_op = [gain] __device__(math_t d_val, index_t idx) { return exp(-gain * d_val); };
-    raft::distance::distance<raft::distance::DistanceType::L2Unexpanded,
-                             math_t,
-                             math_t,
-                             math_t,
-                             decltype(fin_op),
-                             index_t>(const_cast<math_t*>(x1),
-                                      const_cast<math_t*>(x2),
-                                      out,
-                                      n1,
-                                      n2,
-                                      n_cols,
-                                      NULL,
-                                      0,
-                                      fin_op,
-                                      stream,
-                                      is_row_major);
-  }
-};
-
-};  // end namespace Matrix
-};  // namespace MLCommon
diff --git a/cpp/src_prims/selection/processing.cuh b/cpp/src_prims/selection/processing.cuh
deleted file mode 100644
index 203f44666c..0000000000
--- a/cpp/src_prims/selection/processing.cuh
+++ /dev/null
@@ -1,226 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cuml/neighbors/knn.hpp>
-
-#include <raft/linalg/matrix_vector_op.cuh>
-#include <raft/linalg/norm.cuh>
-#include <raft/linalg/unary_op.cuh>
-
-#include <raft/stats/mean.cuh>
-#include <raft/stats/mean_center.cuh>
-
-#include <rmm/device_uvector.hpp>
-
-namespace MLCommon {
-namespace Selection {
-
-/**
- * @brief A virtual class defining pre- and post-processing
- * for metrics. This class will temporarily modify its given
- * state in `preprocess()` and undo those modifications in
- * `postprocess()`
- */
-
-template <typename math_t>
-class MetricProcessor {
- public:
-  virtual void preprocess(math_t* data) {}
-
-  virtual void revert(math_t* data) {}
-
-  virtual void postprocess(math_t* data) {}
-
-  virtual ~MetricProcessor() = default;
-};
-
-template <typename math_t>
-class CosineMetricProcessor : public MetricProcessor<math_t> {
- protected:
-  int k_;
-  bool row_major_;
-  size_t n_rows_;
-  size_t n_cols_;
-  cudaStream_t stream_;
-  rmm::device_uvector<math_t> colsums_;
-
- public:
-  CosineMetricProcessor(size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream)
-    : stream_(stream),
-      colsums_(n_rows, stream),
-      n_cols_(n_cols),
-      n_rows_(n_rows),
-      row_major_(row_major),
-      k_(k)
-  {
-  }
-
-  void preprocess(math_t* data)
-  {
-    raft::linalg::rowNorm(colsums_.data(),
-                          data,
-                          n_cols_,
-                          n_rows_,
-                          raft::linalg::NormType::L2Norm,
-                          row_major_,
-                          stream_,
-                          [] __device__(math_t in) { return sqrtf(in); });
-
-    raft::linalg::matrixVectorOp(
-      data,
-      data,
-      colsums_.data(),
-      n_cols_,
-      n_rows_,
-      row_major_,
-      false,
-      [] __device__(math_t mat_in, math_t vec_in) { return mat_in / vec_in; },
-      stream_);
-  }
-
-  void revert(math_t* data)
-  {
-    raft::linalg::matrixVectorOp(
-      data,
-      data,
-      colsums_.data(),
-      n_cols_,
-      n_rows_,
-      row_major_,
-      false,
-      [] __device__(math_t mat_in, math_t vec_in) { return mat_in * vec_in; },
-      stream_);
-  }
-
-  void postprocess(math_t* data)
-  {
-    raft::linalg::unaryOp(
-      data, data, k_ * n_rows_, [] __device__(math_t in) { return 1 - in; }, stream_);
-  }
-
-  ~CosineMetricProcessor() = default;
-};
-
-template <typename math_t>
-class CorrelationMetricProcessor : public CosineMetricProcessor<math_t> {
-  using cosine = CosineMetricProcessor<math_t>;
-
- public:
-  CorrelationMetricProcessor(
-    size_t n_rows, size_t n_cols, int k, bool row_major, cudaStream_t stream)
-    : CosineMetricProcessor<math_t>(n_rows, n_cols, k, row_major, stream), means_(n_rows, stream)
-  {
-  }
-
-  void preprocess(math_t* data)
-  {
-    math_t normalizer_const = 1.0 / (math_t)cosine::n_cols_;
-
-    raft::linalg::reduce(means_.data(),
-                         data,
-                         cosine::n_cols_,
-                         cosine::n_rows_,
-                         (math_t)0.0,
-                         cosine::row_major_,
-                         true,
-                         cosine::stream_);
-
-    raft::linalg::unaryOp(
-      means_.data(),
-      means_.data(),
-      cosine::n_rows_,
-      [=] __device__(math_t in) { return in * normalizer_const; },
-      cosine::stream_);
-
-    raft::stats::meanCenter(data,
-                            data,
-                            means_.data(),
-                            cosine::n_cols_,
-                            cosine::n_rows_,
-                            cosine::row_major_,
-                            false,
-                            cosine::stream_);
-
-    CosineMetricProcessor<math_t>::preprocess(data);
-  }
-
-  void revert(math_t* data)
-  {
-    CosineMetricProcessor<math_t>::revert(data);
-
-    raft::stats::meanAdd(data,
-                         data,
-                         means_.data(),
-                         cosine::n_cols_,
-                         cosine::n_rows_,
-                         cosine::row_major_,
-                         false,
-                         cosine::stream_);
-  }
-
-  void postprocess(math_t* data) { CosineMetricProcessor<math_t>::postprocess(data); }
-
-  ~CorrelationMetricProcessor() = default;
-
-  rmm::device_uvector<math_t> means_;
-};
-
-template <typename math_t>
-class DefaultMetricProcessor : public MetricProcessor<math_t> {
- public:
-  void preprocess(math_t* data) {}
-
-  void revert(math_t* data) {}
-
-  void postprocess(math_t* data) {}
-
-  ~DefaultMetricProcessor() = default;
-};
-
-template <typename math_t>
-inline std::unique_ptr<MetricProcessor<math_t>> create_processor(
-  raft::distance::DistanceType metric,
-  int n,
-  int D,
-  int k,
-  bool rowMajorQuery,
-  cudaStream_t userStream)
-{
-  MetricProcessor<math_t>* mp = nullptr;
-
-  switch (metric) {
-    case raft::distance::DistanceType::CosineExpanded:
-      mp = new CosineMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream);
-      break;
-
-    case raft::distance::DistanceType::CorrelationExpanded:
-      mp = new CorrelationMetricProcessor<math_t>(n, D, k, rowMajorQuery, userStream);
-      break;
-    default: mp = new DefaultMetricProcessor<math_t>();
-  }
-
-  return std::unique_ptr<MetricProcessor<math_t>>(mp);
-}
-
-// Currently only being used by floats
-template class MetricProcessor<float>;
-template class CosineMetricProcessor<float>;
-template class CorrelationMetricProcessor<float>;
-template class DefaultMetricProcessor<float>;
-
-};  // namespace Selection
-};  // namespace MLCommon
diff --git a/cpp/test/sg/svc_test.cu b/cpp/test/sg/svc_test.cu
index 04a0d04200..bbe7c55b3e 100644
--- a/cpp/test/sg/svc_test.cu
+++ b/cpp/test/sg/svc_test.cu
@@ -14,6 +14,10 @@
  * limitations under the License.
  */
 
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
+#endif
+
 #include <cub/cub.cuh>
 #include <cuml/common/logger.hpp>
 #include <cuml/datasets/make_blobs.hpp>
@@ -23,14 +27,13 @@
 #include <cuml/svm/svr.hpp>
 #include <gtest/gtest.h>
 #include <iostream>
-#include <matrix/grammatrix.cuh>
-#include <matrix/kernelmatrices.cuh>
-#include <raft/core/cudart_utils.hpp>
-#include <raft/cuda_utils.cuh>
+#include <raft/distance/kernels.cuh>
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/map_then_reduce.cuh>
 #include <raft/linalg/transpose.cuh>
 #include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
 #include <string>
 #include <svm/smoblocksolve.cuh>
@@ -49,7 +52,7 @@
 
 namespace ML {
 namespace SVM {
-using namespace MLCommon;
+using namespace raft::distance::kernels;
 using namespace Matrix;
 
 // Initialize device vector C_vec with scalar C
@@ -161,23 +164,23 @@ class KernelCacheTest : public ::testing::Test {
 
  protected:
   // Naive host side kernel implementation used for comparison
-  void ApplyNonlin(Matrix::KernelParams params)
+  void ApplyNonlin(KernelParams params)
   {
     switch (params.kernel) {
-      case Matrix::LINEAR: break;
-      case Matrix::POLYNOMIAL:
+      case LINEAR: break;
+      case POLYNOMIAL:
         for (int z = 0; z < n_rows * n_ws; z++) {
           math_t val            = params.gamma * tile_host_expected[z] + params.coef0;
           tile_host_expected[z] = pow(val, params.degree);
         }
         break;
-      case Matrix::TANH:
+      case TANH:
         for (int z = 0; z < n_rows * n_ws; z++) {
           math_t val            = params.gamma * tile_host_expected[z] + params.coef0;
           tile_host_expected[z] = tanh(val);
         }
         break;
-      case Matrix::RBF:
+      case RBF:
         for (int i = 0; i < n_ws; i++) {
           for (int j = 0; j < n_rows; j++) {
             math_t d = 0;
@@ -236,15 +239,15 @@ TYPED_TEST_CASE_P(KernelCacheTest);
 TYPED_TEST_P(KernelCacheTest, EvalTest)
 {
   auto stream = this->handle.get_stream();
-  std::vector<Matrix::KernelParams> param_vec{Matrix::KernelParams{Matrix::LINEAR, 3, 1, 0},
-                                              Matrix::KernelParams{Matrix::POLYNOMIAL, 2, 1.3, 1},
-                                              Matrix::KernelParams{Matrix::TANH, 2, 0.5, 2.4},
-                                              Matrix::KernelParams{Matrix::RBF, 2, 0.5, 0}};
+  std::vector<KernelParams> param_vec{KernelParams{LINEAR, 3, 1, 0},
+                                      KernelParams{POLYNOMIAL, 2, 1.3, 1},
+                                      KernelParams{TANH, 2, 0.5, 2.4},
+                                      KernelParams{RBF, 2, 0.5, 0}};
   float cache_size = 0;
 
   for (auto params : param_vec) {
-    Matrix::GramMatrixBase<TypeParam>* kernel =
-      Matrix::KernelFactory<TypeParam>::create(params, this->handle.get_cublas_handle());
+    GramMatrixBase<TypeParam>* kernel =
+      KernelFactory<TypeParam>::create(params, this->handle.get_cublas_handle());
     KernelCache<TypeParam> cache(this->handle,
                                  this->x_dev.data(),
                                  this->n_rows,
@@ -267,11 +270,11 @@ TYPED_TEST_P(KernelCacheTest, EvalTest)
 
 TYPED_TEST_P(KernelCacheTest, CacheEvalTest)
 {
-  Matrix::KernelParams param{Matrix::LINEAR, 3, 1, 0};
+  KernelParams param{LINEAR, 3, 1, 0};
   float cache_size = sizeof(TypeParam) * this->n_rows * 32 / (1024.0 * 1024);
 
-  Matrix::GramMatrixBase<TypeParam>* kernel =
-    Matrix::KernelFactory<TypeParam>::create(param, this->handle.get_cublas_handle());
+  GramMatrixBase<TypeParam>* kernel =
+    KernelFactory<TypeParam>::create(param, this->handle.get_cublas_handle());
   KernelCache<TypeParam> cache(this->handle,
                                this->x_dev.data(),
                                this->n_rows,
@@ -290,15 +293,15 @@ TYPED_TEST_P(KernelCacheTest, CacheEvalTest)
 
 TYPED_TEST_P(KernelCacheTest, SvrEvalTest)
 {
-  Matrix::KernelParams param{Matrix::LINEAR, 3, 1, 0};
+  KernelParams param{LINEAR, 3, 1, 0};
   float cache_size = sizeof(TypeParam) * this->n_rows * 32 / (1024.0 * 1024);
 
   this->n_ws        = 6;
   int ws_idx_svr[6] = {0, 5, 1, 4, 3, 7};
   raft::update_device(this->ws_idx_dev.data(), ws_idx_svr, 6, this->stream);
 
-  Matrix::GramMatrixBase<TypeParam>* kernel =
-    Matrix::KernelFactory<TypeParam>::create(param, this->handle.get_cublas_handle());
+  GramMatrixBase<TypeParam>* kernel =
+    KernelFactory<TypeParam>::create(param, this->handle.get_cublas_handle());
   KernelCache<TypeParam> cache(this->handle,
                                this->x_dev.data(),
                                this->n_rows,
@@ -706,7 +709,7 @@ class SmoSolverTest : public ::testing::Test {
     raft::update_device(kernel_dev.data(), kernel_host, n_ws * n_rows, stream);
     RAFT_CUDA_TRY(cudaMemsetAsync(delta_alpha_dev.data(), 0, n_ws * sizeof(math_t), stream));
 
-    kernel = std::make_unique<Matrix::GramMatrixBase<math_t>>(cublas_handle);
+    kernel = std::make_unique<GramMatrixBase<math_t>>(cublas_handle);
   }
 
  public:
@@ -820,7 +823,7 @@ class SmoSolverTest : public ::testing::Test {
   cublasHandle_t cublas_handle;
   cudaStream_t stream = 0;
 
-  std::unique_ptr<Matrix::GramMatrixBase<math_t>> kernel;
+  std::unique_ptr<GramMatrixBase<math_t>> kernel;
   int n_rows       = 6;
   const int n_cols = 2;
   int n_ws         = 6;

From ee4b0e9d4d21ea07862edfd7238fad47b0b1d70f Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 14 Oct 2022 20:07:50 -0400
Subject: [PATCH 25/38] Fixing svm test

---
 cpp/test/sg/linear_svm_test.cu | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cpp/test/sg/linear_svm_test.cu b/cpp/test/sg/linear_svm_test.cu
index 5ce201b83d..26db445de4 100644
--- a/cpp/test/sg/linear_svm_test.cu
+++ b/cpp/test/sg/linear_svm_test.cu
@@ -13,6 +13,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/distance/specializations.cuh>
+#endif
+
 #include <cmath>
 #include <cuml/datasets/make_blobs.hpp>
 #include <cuml/datasets/make_regression.hpp>

From 2f641d6c0e7a16ae00c665e8f6493bca6c96bf92 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 14 Oct 2022 20:22:41 -0400
Subject: [PATCH 26/38] Fixing style

---
 cpp/include/cuml/matrix/kernelparams.h | 2 +-
 cpp/src/svm/smo_sets.cuh               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cuml/matrix/kernelparams.h b/cpp/include/cuml/matrix/kernelparams.h
index 7f59547215..0739d8c733 100644
--- a/cpp/include/cuml/matrix/kernelparams.h
+++ b/cpp/include/cuml/matrix/kernelparams.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
diff --git a/cpp/src/svm/smo_sets.cuh b/cpp/src/svm/smo_sets.cuh
index f9fd7bf484..81a9447087 100644
--- a/cpp/src/svm/smo_sets.cuh
+++ b/cpp/src/svm/smo_sets.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

From d05f7544699957640d62f4c8b8c5274bde3ed542 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 14 Oct 2022 20:24:05 -0400
Subject: [PATCH 27/38] Fixing more style

---
 cpp/bench/sg/svr.cu                    | 1 -
 cpp/include/cuml/matrix/kernelparams.h | 4 ++--
 cpp/src_prims/label/merge_labels.cuh   | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpp/bench/sg/svr.cu b/cpp/bench/sg/svr.cu
index cf9996888a..408dde9fd8 100644
--- a/cpp/bench/sg/svr.cu
+++ b/cpp/bench/sg/svr.cu
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-
 #if defined RAFT_DISTANCE_COMPILED
 #include <raft/distance/specializations.cuh>
 #endif
diff --git a/cpp/include/cuml/matrix/kernelparams.h b/cpp/include/cuml/matrix/kernelparams.h
index 0739d8c733..5815405938 100644
--- a/cpp/include/cuml/matrix/kernelparams.h
+++ b/cpp/include/cuml/matrix/kernelparams.h
@@ -21,8 +21,8 @@
 namespace MLCommon {
 namespace Matrix {
 
-    using raft::distance::KernelType;
-    using raft::distance::KernelParams;
+using raft::distance::KernelParams;
+using raft::distance::KernelType;
 
 };  // end namespace Matrix
 };  // end namespace MLCommon
diff --git a/cpp/src_prims/label/merge_labels.cuh b/cpp/src_prims/label/merge_labels.cuh
index 178b6c692c..b3c25ef6b0 100644
--- a/cpp/src_prims/label/merge_labels.cuh
+++ b/cpp/src_prims/label/merge_labels.cuh
@@ -19,10 +19,10 @@
 #include <limits>
 #include <math.h>
 
-#include <raft/linalg/init.cuh>
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/interruptible.hpp>
 #include <raft/cuda_utils.cuh>
+#include <raft/linalg/init.cuh>
 
 namespace MLCommon {
 namespace Label {

From 5c27e9f60c16403274f4421b5e4776e3438ad167 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Fri, 14 Oct 2022 20:49:02 -0400
Subject: [PATCH 28/38] Removing gram test

---
 cpp/test/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 830405b45c..4082442c35 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -232,14 +232,12 @@ if(BUILD_PRIMS_TESTS)
   ConfigureTest(PREFIX PRIMS NAME BATCHED_GEMV_TEST PATH prims/batched/gemv.cu)
   ConfigureTest(PREFIX PRIMS NAME BATCHED_MAKE_SYMM_TEST PATH prims/batched/make_symm.cu)
   ConfigureTest(PREFIX PRIMS NAME BATCHED_MATRIX_TEST PATH prims/batched/matrix.cu)
-  ConfigureTest(PREFIX PRIMS NAME CACHE_TEST PATH prims/cache.cu)
   ConfigureTest(PREFIX PRIMS NAME DECOUPLED_LOOKBACK_TEST PATH prims/decoupled_lookback.cu)
   ConfigureTest(PREFIX PRIMS NAME DEVICE_UTILS_TEST PATH prims/device_utils.cu)
   ConfigureTest(PREFIX PRIMS NAME ELTWISE2D_TEST PATH prims/eltwise2d.cu)
   ConfigureTest(PREFIX PRIMS NAME FAST_INT_DIV_TEST PATH prims/fast_int_div.cu)
   ConfigureTest(PREFIX PRIMS NAME FILLNA_TEST PATH prims/fillna.cu)
   ConfigureTest(PREFIX PRIMS NAME GATHER_TEST PATH prims/gather.cu)
-  ConfigureTest(PREFIX PRIMS NAME GRAM_TEST PATH prims/gram.cu)
   ConfigureTest(PREFIX PRIMS NAME GRID_SYNC_TEST PATH prims/grid_sync.cu)
   ConfigureTest(PREFIX PRIMS NAME HINGE_TEST PATH prims/hinge.cu)
   ConfigureTest(PREFIX PRIMS NAME JONES_TRANSFORM_TEST PATH prims/jones_transform.cu)

From 1e5c2beaae0c4080f600434f32d993e38c1c99ec Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Sat, 15 Oct 2022 11:31:43 -0400
Subject: [PATCH 29/38] Adding knn classify/regress tests back in

---
 cpp/test/prims/knn_classify.cu   | 143 +++++++++++++++++++++++++++
 cpp/test/prims/knn_regression.cu | 165 +++++++++++++++++++++++++++++++
 2 files changed, 308 insertions(+)
 create mode 100644 cpp/test/prims/knn_classify.cu
 create mode 100644 cpp/test/prims/knn_regression.cu

diff --git a/cpp/test/prims/knn_classify.cu b/cpp/test/prims/knn_classify.cu
new file mode 100644
index 0000000000..26c879e974
--- /dev/null
+++ b/cpp/test/prims/knn_classify.cu
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test_utils.h"
+#include <gtest/gtest.h>
+#include <iostream>
+#include <raft/core/cudart_utils.hpp>
+#include <raft/cuda_utils.cuh>
+#include <raft/label/classlabels.cuh>
+#include <raft/random/make_blobs.cuh>
+#include <raft/spatial/knn/knn.cuh>
+#include <rmm/device_uvector.hpp>
+#include <selection/knn.cuh>
+#include <vector>
+
+namespace MLCommon {
+namespace Selection {
+// Parameters for one KNNClassifyTest case (brace-initialized in `inputsf` in field order).
+struct KNNClassifyInputs {
+  int rows;           // number of samples generated; also the number of queries
+  int cols;           // number of features per sample
+  int n_labels;       // forwarded to make_blobs (presumably the number of clusters)
+  float cluster_std;  // forwarded to make_blobs (presumably the cluster standard deviation)
+  int k;              // number of neighbors requested from brute_force_knn
+};
+// Fixture: runs kNN classification on generated blobs, querying with the training set itself.
+class KNNClassifyTest : public ::testing::TestWithParam<KNNClassifyInputs> {
+ public:
+  KNNClassifyTest()
+    : params(::testing::TestWithParam<KNNClassifyInputs>::GetParam()),
+      stream(handle.get_stream()),
+      train_samples(params.rows * params.cols, stream),
+      train_labels(params.rows, stream),
+      pred_labels(params.rows, stream),
+      knn_indices(params.rows * params.k, stream),
+      knn_dists(params.rows * params.k, stream)
+  {
+    basicTest();
+  }
+
+ protected:
+  void basicTest()
+  {
+    raft::random::make_blobs<float, int>(train_samples.data(),
+                                         train_labels.data(),
+                                         params.rows,
+                                         params.cols,
+                                         params.n_labels,
+                                         stream,
+                                         true,
+                                         nullptr,
+                                         nullptr,
+                                         params.cluster_std);
+    // Collect the distinct training labels; the returned count is used as n_classes below.
+    rmm::device_uvector<int> unique_labels(0, stream);
+    auto n_classes =
+      raft::label::getUniquelabels(unique_labels, train_labels.data(), params.rows, stream);
+    // Single index partition: one pointer/size pair covering all training rows.
+    std::vector<float*> ptrs(1);
+    std::vector<int> sizes(1);
+    ptrs[0]  = train_samples.data();
+    sizes[0] = params.rows;
+    // Query with the training samples themselves (same pointer and row count).
+    raft::spatial::knn::brute_force_knn(handle,
+                                        ptrs,
+                                        sizes,
+                                        params.cols,
+                                        train_samples.data(),
+                                        params.rows,
+                                        knn_indices.data(),
+                                        knn_dists.data(),
+                                        params.k);
+    // knn_classify takes vectors of label arrays; this test supplies a single entry in each.
+    std::vector<int*> y;
+    y.push_back(train_labels.data());
+
+    std::vector<int*> uniq_labels;
+    uniq_labels.push_back(unique_labels.data());
+
+    std::vector<int> n_unique;
+    n_unique.push_back(n_classes);
+    // Write one predicted label per row into pred_labels.
+    knn_classify(handle,
+                 pred_labels.data(),
+                 knn_indices.data(),
+                 y,
+                 params.rows,
+                 params.rows,
+                 params.k,
+                 uniq_labels,
+                 n_unique);
+    // Synchronize the stream so the results are complete before the TEST_P body compares them.
+    handle.sync_stream(stream);
+  }
+
+ protected:
+  KNNClassifyInputs params;
+  raft::handle_t handle;
+  cudaStream_t stream;
+
+  rmm::device_uvector<float> train_samples;
+  rmm::device_uvector<int> train_labels;
+
+  rmm::device_uvector<int> pred_labels;
+
+  rmm::device_uvector<int64_t> knn_indices;
+  rmm::device_uvector<float> knn_dists;
+};
+// Every training point must be assigned its own label when the training set is the query set.
+typedef KNNClassifyTest KNNClassifyTestF;
+TEST_P(KNNClassifyTestF, Fit)
+{
+  ASSERT_TRUE(
+    devArrMatch(train_labels.data(), pred_labels.data(), params.rows, raft::Compare<int>()));
+}
+// Cases listed as {rows, cols, n_labels, cluster_std, k}.
+const std::vector<KNNClassifyInputs> inputsf = {{100, 10, 2, 0.01f, 2},
+                                                {1000, 10, 5, 0.01f, 2},
+                                                {10000, 10, 5, 0.01f, 2},
+                                                {100, 10, 2, 0.01f, 10},
+                                                {1000, 10, 5, 0.01f, 10},
+                                                {10000, 10, 5, 0.01f, 10},
+                                                {100, 10, 2, 0.01f, 50},
+                                                {1000, 10, 5, 0.01f, 50},
+                                                {10000, 10, 5, 0.01f, 50}};
+
+INSTANTIATE_TEST_CASE_P(KNNClassifyTest, KNNClassifyTestF, ::testing::ValuesIn(inputsf));
+
+};  // end namespace Selection
+};  // namespace MLCommon
\ No newline at end of file
diff --git a/cpp/test/prims/knn_regression.cu b/cpp/test/prims/knn_regression.cu
new file mode 100644
index 0000000000..5a57006267
--- /dev/null
+++ b/cpp/test/prims/knn_regression.cu
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "test_utils.h"
+
+#include <gtest/gtest.h>
+
+#include <label/classlabels.cuh>
+
+#include <raft/core/cudart_utils.hpp>
+#include <raft/cuda_utils.cuh>
+#include <raft/linalg/reduce.cuh>
+#include <raft/random/rng.cuh>
+#include <raft/spatial/knn/knn.cuh>
+
+#include <rmm/device_uvector.hpp>
+
+#include <selection/knn.cuh>
+
+#include <thrust/device_ptr.h>
+#include <thrust/execution_policy.h>
+#include <thrust/extrema.h>
+
+#include <iostream>
+#include <vector>
+
+namespace MLCommon {
+namespace Selection {
+// Parameters for one KNNRegressionTest case (brace-initialized in `inputsf` in field order).
+struct KNNRegressionInputs {
+  int rows;           // number of samples generated; also the number of queries
+  int cols;           // number of features per sample
+  int n_labels;       // not read anywhere in this file
+  float cluster_std;  // not read anywhere in this file
+  int k;              // number of neighbors requested from brute_force_knn
+};
+// Test data: uniform random samples in out_samples; labels are a reduction scaled by their max.
+void generate_data(
+  float* out_samples, float* out_labels, int n_rows, int n_cols, cudaStream_t stream)
+{
+  raft::random::Rng r(0ULL, raft::random::GenPC);
+  // Fill all n_rows * n_cols entries with uniform draws bounded by 0.0f and 1.0f.
+  r.uniform(out_samples, n_rows * n_cols, 0.0f, 1.0f, stream);
+  // NOTE(review): len is n_rows, so only the first n_rows entries are mapped to 2x - 1; confirm.
+  raft::linalg::unaryOp<float>(
+    out_samples,
+    out_samples,
+    n_rows,
+    [=] __device__(float input) { return 2 * input - 1; },
+    stream);
+  // Reduce with square -> sum -> sqrt; presumably one L2 norm per row of out_samples (confirm).
+  raft::linalg::reduce(
+    out_labels,
+    out_samples,
+    n_cols,
+    n_rows,
+    0.0f,
+    true,
+    true,
+    stream,
+    false,
+    [=] __device__(float in, int n) { return in * in; },
+    raft::Sum<float>(),
+    [=] __device__(float in) { return sqrt(in); });
+  // Largest of the n_rows label values, read into a host float.
+  thrust::device_ptr<float> d_ptr = thrust::device_pointer_cast(out_labels);
+  float max = *(thrust::max_element(thrust::cuda::par.on(stream), d_ptr, d_ptr + n_rows));
+  // Divide every label by that maximum.
+  raft::linalg::unaryOp<float>(
+    out_labels, out_labels, n_rows, [=] __device__(float input) { return input / max; }, stream);
+}
+// Fixture: runs kNN regression on generate_data() output, querying with the training set itself.
+class KNNRegressionTest : public ::testing::TestWithParam<KNNRegressionInputs> {
+ public:
+  KNNRegressionTest()
+    : stream(handle.get_stream()),  // initializers follow member declaration order
+      params(::testing::TestWithParam<KNNRegressionInputs>::GetParam()),
+      train_samples(params.rows * params.cols, stream),
+      train_labels(params.rows, stream),
+      pred_labels(params.rows, stream),
+      knn_indices(params.rows * params.k, stream),
+      knn_dists(params.rows * params.k, stream)
+  {
+  }
+
+ protected:
+  void basicTest()
+  {
+    generate_data(train_samples.data(), train_labels.data(), params.rows, params.cols, stream);
+    // Single index partition: one pointer/size pair covering all training rows.
+    std::vector<float*> ptrs(1);
+    std::vector<int> sizes(1);
+    ptrs[0]  = train_samples.data();
+    sizes[0] = params.rows;
+    // Query with the training samples themselves (same pointer and row count).
+    raft::spatial::knn::brute_force_knn(handle,
+                                        ptrs,
+                                        sizes,
+                                        params.cols,
+                                        train_samples.data(),
+                                        params.rows,
+                                        knn_indices.data(),
+                                        knn_dists.data(),
+                                        params.k);
+    // knn_regress takes a vector of target arrays; this test supplies a single one.
+    std::vector<float*> y;
+    y.push_back(train_labels.data());
+
+    knn_regress(
+      handle, pred_labels.data(), knn_indices.data(), y, params.rows, params.rows, params.k);
+    // Synchronize the stream so the results are complete before the TEST_P body compares them.
+    handle.sync_stream(stream);
+  }
+
+  void SetUp() override { basicTest(); }
+
+ protected:
+  raft::handle_t handle;
+  cudaStream_t stream;
+
+  KNNRegressionInputs params;
+
+  rmm::device_uvector<float> train_samples;
+  rmm::device_uvector<float> train_labels;
+
+  rmm::device_uvector<float> pred_labels;
+
+  rmm::device_uvector<int64_t> knn_indices;
+  rmm::device_uvector<float> knn_dists;
+};
+// Predictions must match the training targets within the CompareApprox<float>(0.3) tolerance.
+typedef KNNRegressionTest KNNRegressionTestF;
+TEST_P(KNNRegressionTestF, Fit)
+{
+  ASSERT_TRUE(devArrMatch(
+    train_labels.data(), pred_labels.data(), params.rows, raft::CompareApprox<float>(0.3)));
+}
+// Cases listed as {rows, cols, n_labels, cluster_std, k}.
+const std::vector<KNNRegressionInputs> inputsf = {{100, 10, 2, 0.01f, 2},
+                                                  {1000, 10, 5, 0.01f, 2},
+                                                  {10000, 10, 5, 0.01f, 2},
+                                                  {100, 10, 2, 0.01f, 10},
+                                                  {1000, 10, 5, 0.01f, 10},
+                                                  {10000, 10, 5, 0.01f, 10},
+                                                  {100, 10, 2, 0.01f, 15},
+                                                  {1000, 10, 5, 0.01f, 15},
+                                                  {10000, 10, 5, 0.01f, 15}};
+
+INSTANTIATE_TEST_CASE_P(KNNRegressionTest, KNNRegressionTestF, ::testing::ValuesIn(inputsf));
+
+};  // end namespace Selection
+};  // namespace MLCommon
\ No newline at end of file

From c97694e2cbc3fc606fa366a7cc1b77c735343c8e Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Sat, 15 Oct 2022 14:21:55 -0400
Subject: [PATCH 30/38] Fixing information criterion use

---
 cpp/src/arima/batched_arima.cu                | 312 ++++++++++++++++++
 .../prims/batched/information_criterion.cu    | 149 ---------
 2 files changed, 312 insertions(+), 149 deletions(-)
 delete mode 100644 cpp/test/prims/batched/information_criterion.cu

diff --git a/cpp/src/arima/batched_arima.cu b/cpp/src/arima/batched_arima.cu
index d1587b838a..bcb911af43 100644
--- a/cpp/src/arima/batched_arima.cu
+++ b/cpp/src/arima/batched_arima.cu
@@ -31,6 +31,7 @@
 
 #include <common/nvtx.hpp>
 #include <linalg/batched/matrix.cuh>
+<<<<<<< Updated upstream
 <<<<<<< HEAD
 #include <raft/common/nvtx.hpp>
 #include <raft/cuda_utils.cuh>
@@ -40,6 +41,9 @@
 #include <raft/stats/information_criterion.hpp>
 =======
 #include <metrics/batched/information_criterion.cuh>
+=======
+#include <raft/stats/information_criterion.cuh>
+>>>>>>> Stashed changes
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/handle.hpp>
 #include <raft/core/nvtx.hpp>
@@ -571,8 +575,316 @@ namespace ML
                     method,
                     truncate);
 
+<<<<<<< Updated upstream
     for (int i = 0; i < N; i++) {
       // Add the perturbation to the i-th parameter
+=======
+    // First derivative with a first-order accuracy
+    thrust::for_each(
+      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+        d_grad[N * bid + i] = (d_ll_pert[bid] - d_ll_base[bid]) / h;
+      });
+
+    // Reset the i-th parameter
+    thrust::for_each(
+      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+        d_x_pert[N * bid + i] = d_x[N * bid + i];
+      });
+  }
+}
+
+void information_criterion(raft::handle_t& handle,
+                           const ARIMAMemory<double>& arima_mem,
+                           const double* d_y,
+                           const double* d_exog,
+                           int batch_size,
+                           int n_obs,
+                           const ARIMAOrder& order,
+                           const ARIMAParams<double>& params,
+                           double* d_ic,
+                           int ic_type)
+{
+  raft::common::nvtx::range fun_scope(__func__);
+  auto stream = handle.get_stream();
+
+  /* Compute log-likelihood in d_ic */
+  batched_loglike(
+    handle, arima_mem, d_y, d_exog, batch_size, n_obs, order, params, d_ic, false, false, MLE);
+
+  /* Compute information criterion from log-likelihood and base term */
+  raft::stats::information_criterion_batched(handle,
+                                             d_ic,
+                                             d_ic,
+                                             static_cast<MLCommon::Metrics::IC_Type>(ic_type),
+                                             order.complexity(),
+                                             batch_size,
+                                             n_obs - order.n_diff());
+}
+
+/**
+ * Test that the parameters are valid for the inverse transform
+ *
+ * @tparam isAr        Are these (S)AR or (S)MA parameters?
+ * @param[in]  params  Parameters
+ * @param[in]  pq      p for AR, q for MA, P for SAR, Q for SMA
+ */
+template <bool isAr>
+DI bool test_invparams(const double* params, int pq)
+{
+  double new_params[8];
+  double tmp[8];
+
+  constexpr double coef = isAr ? 1 : -1;
+
+  for (int i = 0; i < pq; i++) {
+    tmp[i] = params[i];
+    new_params[i] = tmp[i];
+  }
+
+  // Perform inverse transform and stop before atanh step
+  for (int j = pq - 1; j > 0; --j) {
+    double a = new_params[j];
+    for (int k = 0; k < j; ++k) {
+      tmp[k] = (new_params[k] + coef * a * new_params[j - k - 1]) / (1 - (a * a));
+    }
+    for (int iter = 0; iter < j; ++iter) {
+      new_params[iter] = tmp[iter];
+    }
+  }
+
+  // Verify that the values are between -1 and 1
+  bool result = true;
+  for (int i = 0; i < pq; i++) {
+    result = result && !(new_params[i] <= -1 || new_params[i] >= 1);
+  }
+  return result;
+}
+
+/**
+ * Auxiliary function of _start_params: least square approximation of an
+ * ARMA model (with or without seasonality)
+ * @note: in this function the non-seasonal case has s=1, not s=0!
+ */
+void _arma_least_squares(raft::handle_t& handle,
+                         double* d_ar,
+                         double* d_ma,
+                         double* d_sigma2,
+                         const MLCommon::LinAlg::Batched::Matrix<double>& bm_y,
+                         int p,
+                         int q,
+                         int s,
+                         bool estimate_sigma2,
+                         int k = 0,
+                         double* d_mu = nullptr)
+{
+  const auto& handle_impl = handle;
+  auto stream = handle_impl.get_stream();
+  auto cublas_handle = handle_impl.get_cublas_handle();
+  auto counting = thrust::make_counting_iterator(0);
+
+  int batch_size = bm_y.batches();
+  int n_obs = bm_y.shape().first;
+
+  int ps = p * s, qs = q * s;
+  int p_ar = std::max(ps, 2 * qs);
+  int r = std::max(p_ar + qs, ps);
+
+  if ((q && p_ar >= n_obs - p_ar) || p + q + k >= n_obs - r) {
+    // Too few observations for the estimate, fill with 0 (1 for sigma2)
+    if (k) RAFT_CUDA_TRY(cudaMemsetAsync(d_mu, 0, sizeof(double) * batch_size, stream));
+    if (p) RAFT_CUDA_TRY(cudaMemsetAsync(d_ar, 0, sizeof(double) * p * batch_size, stream));
+    if (q) RAFT_CUDA_TRY(cudaMemsetAsync(d_ma, 0, sizeof(double) * q * batch_size, stream));
+    if (estimate_sigma2) {
+      thrust::device_ptr<double> sigma2_thrust = thrust::device_pointer_cast(d_sigma2);
+      thrust::fill(thrust::cuda::par.on(stream), sigma2_thrust, sigma2_thrust + batch_size, 1.0);
+    }
+    return;
+  }
+
+  /* Matrix formed by lag matrices of y and the residuals respectively,
+   * side by side. The left side will be used to estimate AR, the right
+   * side to estimate MA */
+  MLCommon::LinAlg::Batched::Matrix<double> bm_ls_ar_res(
+    n_obs - r, p + q + k, batch_size, cublas_handle, stream, false);
+  int ar_offset = r - ps;
+  int res_offset = r - p_ar - qs;
+
+  // Get residuals from an AR(p_ar) model to estimate the MA parameters
+  if (q) {
+    // Create lagged y
+    int ls_height = n_obs - p_ar;
+    MLCommon::LinAlg::Batched::Matrix<double> bm_ls =
+      MLCommon::LinAlg::Batched::b_lagged_mat(bm_y, p_ar);
+
+    /* Matrix for the initial AR fit, initialized by copy of y
+     * (note: this is because gels works in-place ; the matrix has larger
+     *  dimensions than the actual AR fit) */
+    MLCommon::LinAlg::Batched::Matrix<double> bm_ar_fit =
+      MLCommon::LinAlg::Batched::b_2dcopy(bm_y, p_ar, 0, ls_height, 1);
+
+    // Residual, initialized as offset y to avoid one kernel call
+    MLCommon::LinAlg::Batched::Matrix<double> bm_residual(bm_ar_fit);
+
+    // Initial AR fit
+    MLCommon::LinAlg::Batched::b_gels(bm_ls, bm_ar_fit);
+
+    // Compute residual (technically a gemv)
+    MLCommon::LinAlg::Batched::b_gemm(
+      false, false, ls_height, 1, p_ar, -1.0, bm_ls, bm_ar_fit, 1.0, bm_residual);
+
+    // Lags of the residual
+    MLCommon::LinAlg::Batched::b_lagged_mat(
+      bm_residual, bm_ls_ar_res, q, n_obs - r, res_offset, (n_obs - r) * (k + p), s);
+  }
+
+  // Fill the first column of the matrix with 1 if we fit an intercept
+  if (k) {
+    double* d_ls_ar_res = bm_ls_ar_res.raw_data();
+    thrust::for_each(
+      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+        double* b_ls_ar_res = d_ls_ar_res + bid * (n_obs - r) * (p + q + k);
+        for (int i = 0; i < n_obs - r; i++) {
+          b_ls_ar_res[i] = 1.0;
+        }
+      });
+  }
+
+  // Lags of y
+  MLCommon::LinAlg::Batched::b_lagged_mat(
+    bm_y, bm_ls_ar_res, p, n_obs - r, ar_offset, (n_obs - r) * k, s);
+
+  /* Initializing the vector for the ARMA fit
+   * (note: also in-place as described for AR fit) */
+  MLCommon::LinAlg::Batched::Matrix<double> bm_arma_fit =
+    MLCommon::LinAlg::Batched::b_2dcopy(bm_y, r, 0, n_obs - r, 1);
+
+  // The residuals will be computed only if sigma2 is requested
+  MLCommon::LinAlg::Batched::Matrix<double> bm_final_residual(
+    n_obs - r, 1, batch_size, cublas_handle, stream, false);
+  if (estimate_sigma2) {
+    raft::copy(
+      bm_final_residual.raw_data(), bm_arma_fit.raw_data(), (n_obs - r) * batch_size, stream);
+  }
+
+  // ARMA fit
+  MLCommon::LinAlg::Batched::b_gels(bm_ls_ar_res, bm_arma_fit);
+
+  // Copy the results in the parameter vectors
+  const double* d_arma_fit = bm_arma_fit.raw_data();
+  thrust::for_each(
+    thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+      const double* b_arma_fit = d_arma_fit + bid * (n_obs - r);
+      if (k) { d_mu[bid] = b_arma_fit[0]; }
+      if (p) {
+        double* b_ar = d_ar + bid * p;
+        for (int i = 0; i < p; i++) {
+          b_ar[i] = b_arma_fit[i + k];
+        }
+      }
+      if (q) {
+        double* b_ma = d_ma + bid * q;
+        for (int i = 0; i < q; i++) {
+          b_ma[i] = b_arma_fit[i + p + k];
+        }
+      }
+    });
+
+  if (estimate_sigma2) {
+    // Compute final residual (technically a gemv)
+    MLCommon::LinAlg::Batched::b_gemm(false,
+                                      false,
+                                      n_obs - r,
+                                      1,
+                                      p + q + k,
+                                      -1.0,
+                                      bm_ls_ar_res,
+                                      bm_arma_fit,
+                                      1.0,
+                                      bm_final_residual);
+
+    // Compute variance
+    double* d_residual = bm_final_residual.raw_data();
+    thrust::for_each(
+      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+        double acc = 0.0;
+        const double* b_residual = d_residual + (n_obs - r) * bid;
+        for (int i = q; i < n_obs - r; i++) {
+          double res = b_residual[i];
+          acc += res * res;
+        }
+        d_sigma2[bid] = acc / static_cast<double>(n_obs - r - q);
+      });
+  }
+
+  // If (S)AR or (S)MA are not valid for the inverse transform, set them to zero
+  thrust::for_each(
+    thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+      if (p) {
+        double* b_ar = d_ar + bid * p;
+        bool valid = test_invparams<true>(b_ar, p);
+        if (!valid) {
+          for (int ip = 0; ip < p; ip++)
+            b_ar[ip] = 0;
+        }
+      }
+      if (q) {
+        double* b_ma = d_ma + bid * q;
+        bool valid = test_invparams<false>(b_ma, q);
+        if (!valid) {
+          for (int iq = 0; iq < q; iq++)
+            b_ma[iq] = 0;
+        }
+      }
+    });
+}
+
+/**
+ * Auxiliary function of estimate_x0: compute the starting parameters for
+ * the series pre-processed by estimate_x0
+ */
+void _start_params(raft::handle_t& handle,
+                   ARIMAParams<double>& params,
+                   MLCommon::LinAlg::Batched::Matrix<double>& bm_y,
+                   const MLCommon::LinAlg::Batched::Matrix<double>& bm_exog,
+                   const ARIMAOrder& order)
+{
+  int batch_size = bm_exog.batches();
+  cudaStream_t stream = bm_exog.stream();
+
+  // Estimate exog coefficients and subtract component to endog.
+  // Exog coefficients are estimated by fitting a linear regression with X=exog, y=endog
+  if (order.n_exog > 0) {
+    // In most cases, the system will be overdetermined and we can use gels
+    if (bm_exog.shape().first > static_cast<unsigned int>(order.n_exog)) {
+      // Make a copy of the exogenous series for in-place gels
+      MLCommon::LinAlg::Batched::Matrix<double> bm_exog_copy(bm_exog);
+      // Make a copy of the endogenous series for in-place gels
+      MLCommon::LinAlg::Batched::Matrix<double> bm_y_copy(bm_y);
+
+      // Least-squares solution of overdetermined system
+      rmm::device_uvector<int> info(batch_size, stream);
+      b_gels(bm_exog_copy, bm_y_copy, info.data());
+
+      // Make a batched matrix around the exogenous coefficients
+      rmm::device_uvector<double*> beta_pointers(batch_size, stream);
+      MLCommon::LinAlg::Batched::Matrix<double> bm_exog_coef(order.n_exog,
+                                                             1,
+                                                             batch_size,
+                                                             bm_exog.cublasHandle(),
+                                                             beta_pointers.data(),
+                                                             params.beta,
+                                                             stream,
+                                                             false);
+
+      // Copy the solution of the system to the parameters array
+      b_2dcopy(bm_y_copy, bm_exog_coef, 0, 0, order.n_exog, 1);
+
+      // Set parameters to zero when solving was not successful
+      auto counting = thrust::make_counting_iterator(0);
+      int* devInfoArray = info.data();
+      double* d_exog_coef = bm_exog_coef.raw_data();
+      const int& n_exog = order.n_exog;
+>>>>>>> Stashed changes
       thrust::for_each(
         thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
           d_x_pert[N * bid + i] = d_x[N * bid + i] + h;
diff --git a/cpp/test/prims/batched/information_criterion.cu b/cpp/test/prims/batched/information_criterion.cu
deleted file mode 100644
index 1bf1b8ade6..0000000000
--- a/cpp/test/prims/batched/information_criterion.cu
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <test_utils.h>
-
-#include <metrics/batched/information_criterion.cuh>
-
-#include <raft/core/cudart_utils.hpp>
-#include <rmm/device_uvector.hpp>
-
-#include <gtest/gtest.h>
-
-#include <cmath>
-#include <random>
-#include <vector>
-
-namespace MLCommon {
-namespace Metrics {
-namespace Batched {
-
-template <typename T>
-void naive_ic(
-  T* h_ic, const T* h_loglike, IC_Type ic_type, int n_params, int batch_size, int n_samples)
-{
-  T ic_base{};
-  T N = static_cast<T>(n_params);
-  T M = static_cast<T>(n_samples);
-  switch (ic_type) {
-    case AIC: ic_base = (T)2 * N; break;
-    case AICc: ic_base = (T)2 * (N + (N * (N + (T)1)) / (M - N - (T)1)); break;
-    case BIC: ic_base = std::log(M) * N; break;
-  }
-#pragma omp parallel for
-  for (int bid = 0; bid < batch_size; bid++) {
-    h_ic[bid] = ic_base - (T)2.0 * h_loglike[bid];
-  }
-}
-
-template <typename T>
-struct BatchedICInputs {
-  int batch_size;
-  int n_params;
-  int n_samples;
-  IC_Type ic_type;
-  T tolerance;
-};
-
-template <typename T>
-class BatchedICTest : public ::testing::TestWithParam<BatchedICInputs<T>> {
- protected:
-  BatchedICTest()
-    : params(::testing::TestWithParam<BatchedICInputs<T>>::GetParam()),
-      res_d(sizeof(T) * params.batch_size, stream)
-  {
-  }
-
-  void SetUp() override
-  {
-    using std::vector;
-
-    // Create stream
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-    // Create arrays
-    std::vector<T> loglike_h = std::vector<T>(params.batch_size);
-    res_h.resize(params.batch_size);
-    rmm::device_uvector<T> loglike_d(sizeof(T) * params.batch_size, stream);
-
-    // Generate random data
-    std::random_device rd;
-    std::mt19937 gen(rd());
-    std::uniform_real_distribution<T> udis(0.001, 1.0);  // 0 has no log
-    for (int i = 0; i < params.batch_size; i++)
-      loglike_h[i] = std::log(udis(gen));
-
-    // Copy the data to the device
-    raft::update_device(loglike_d.data(), loglike_h.data(), params.batch_size, stream);
-
-    // Compute the tested results
-    information_criterion(res_d.data(),
-                          loglike_d.data(),
-                          params.ic_type,
-                          params.n_params,
-                          params.batch_size,
-                          params.n_samples,
-                          stream);
-
-    // Compute the expected results
-    naive_ic(res_h.data(),
-             loglike_h.data(),
-             params.ic_type,
-             params.n_params,
-             params.batch_size,
-             params.n_samples);
-  }
-
- protected:
-  cudaStream_t stream = 0;
-  BatchedICInputs<T> params;
-  rmm::device_uvector<T> res_d;
-  std::vector<T> res_h;
-};
-
-// Test parameters (op, n_batches, m, n, p, q, tolerance)
-const std::vector<BatchedICInputs<double>> inputsd = {
-  {1, 5, 52, AIC, 1e-3}, {10, 7, 100, AICc, 1e-3}, {67, 2, 350, BIC, 1e-3}};
-
-// Test parameters (op, n_batches, m, n, p, q, tolerance)
-const std::vector<BatchedICInputs<float>> inputsf = {
-  {1, 5, 52, AIC, 1e-3}, {10, 7, 100, AICc, 1e-3}, {67, 2, 350, BIC, 1e-3}};
-
-using BatchedICTestD = BatchedICTest<double>;
-using BatchedICTestF = BatchedICTest<float>;
-TEST_P(BatchedICTestD, Result)
-{
-  ASSERT_TRUE(devArrMatchHost(res_h.data(),
-                              res_d.data(),
-                              params.batch_size,
-                              raft::CompareApprox<double>(params.tolerance),
-                              stream));
-}
-TEST_P(BatchedICTestF, Result)
-{
-  ASSERT_TRUE(devArrMatchHost(res_h.data(),
-                              res_d.data(),
-                              params.batch_size,
-                              raft::CompareApprox<float>(params.tolerance),
-                              stream));
-}
-
-INSTANTIATE_TEST_CASE_P(BatchedICTests, BatchedICTestD, ::testing::ValuesIn(inputsd));
-INSTANTIATE_TEST_CASE_P(BatchedICTests, BatchedICTestF, ::testing::ValuesIn(inputsf));
-
-}  // namespace Batched
-}  // namespace Metrics
-}  // namespace MLCommon

From 011ebb0abbeeb7c962c61d67ea58fb1db82dcb26 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Sat, 15 Oct 2022 14:50:34 -0400
Subject: [PATCH 31/38] Fixing bad merge

---
 cpp/src/arima/batched_arima.cu | 1813 ++++++++++++++++----------------
 1 file changed, 897 insertions(+), 916 deletions(-)

diff --git a/cpp/src/arima/batched_arima.cu b/cpp/src/arima/batched_arima.cu
index bcb911af43..b65bb35033 100644
--- a/cpp/src/arima/batched_arima.cu
+++ b/cpp/src/arima/batched_arima.cu
@@ -29,568 +29,549 @@
 #include <cuml/tsa/batched_arima.hpp>
 #include <cuml/tsa/batched_kalman.hpp>
 
-#include <common/nvtx.hpp>
 #include <linalg/batched/matrix.cuh>
-<<<<<<< Updated upstream
-<<<<<<< HEAD
-#include <raft/common/nvtx.hpp>
-#include <raft/cuda_utils.cuh>
-#include <raft/cudart_utils.h>
-#include <raft/handle.hpp>
-#include <raft/linalg/matrix_vector_op.hpp>
-#include <raft/stats/information_criterion.hpp>
-=======
-#include <metrics/batched/information_criterion.cuh>
-=======
-#include <raft/stats/information_criterion.cuh>
->>>>>>> Stashed changes
-#include <raft/core/cudart_utils.hpp>
 #include <raft/core/handle.hpp>
 #include <raft/core/nvtx.hpp>
-#include <raft/cuda_utils.cuh>
-#include <raft/linalg/matrix_vector_op.cuh>
->>>>>>> branch-22.10
+#include <raft/linalg/matrix_vector_op.hpp>
+#include <raft/stats/information_criterion.hpp>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
 #include <timeSeries/arima_helpers.cuh>
 #include <timeSeries/fillna.cuh>
 
-namespace ML
+namespace ML {
+void pack(raft::handle_t& handle,
+          const ARIMAParams<double>& params,
+          const ARIMAOrder& order,
+          int batch_size,
+          double* param_vec)
 {
-  void pack(raft::handle_t & handle,
-            const ARIMAParams<double>& params,
+  const auto stream = handle.get_stream();
+  params.pack(order, batch_size, param_vec, stream);
+}
+
+void unpack(raft::handle_t& handle,
+            ARIMAParams<double>& params,
             const ARIMAOrder& order,
             int batch_size,
-            double* param_vec)
-  {
-    const auto stream = handle.get_stream();
-    params.pack(order, batch_size, param_vec, stream);
-  }
+            const double* param_vec)
+{
+  const auto stream = handle.get_stream();
+  params.unpack(order, batch_size, param_vec, stream);
+}
 
-  void unpack(raft::handle_t & handle,
-              ARIMAParams<double> & params,
-              const ARIMAOrder& order,
-              int batch_size,
-              const double* param_vec)
-  {
-    const auto stream = handle.get_stream();
-    params.unpack(order, batch_size, param_vec, stream);
-  }
+void batched_diff(raft::handle_t& handle,
+                  double* d_y_diff,
+                  const double* d_y,
+                  int batch_size,
+                  int n_obs,
+                  const ARIMAOrder& order)
+{
+  const auto stream = handle.get_stream();
+  MLCommon::TimeSeries::prepare_data(
+    d_y_diff, d_y, batch_size, n_obs, order.d, order.D, order.s, stream);
+}
 
-  void batched_diff(raft::handle_t & handle,
-                    double* d_y_diff,
-                    const double* d_y,
-                    int batch_size,
-                    int n_obs,
-                    const ARIMAOrder& order)
-  {
-    const auto stream = handle.get_stream();
-    MLCommon::TimeSeries::prepare_data(
-      d_y_diff, d_y, batch_size, n_obs, order.d, order.D, order.s, stream);
-  }
+template <typename T>
+struct is_missing {
+  typedef T argument_type;
+  typedef T result_type;
 
-  template <typename T>
-  struct is_missing {
-    typedef T argument_type;
-    typedef T result_type;
+  __thrust_exec_check_disable__ __device__ const T operator()(const T& x) const { return isnan(x); }
+};  // end is_missing
 
-    __thrust_exec_check_disable__ __device__ const T operator()(const T& x) const
-    {
-      return isnan(x);
-    }
-  };  // end is_missing
+bool detect_missing(raft::handle_t& handle, const double* d_y, int n_elem)
+{
+  return thrust::any_of(
+    thrust::cuda::par.on(handle.get_stream()), d_y, d_y + n_elem, is_missing<double>());
+}
 
-  bool detect_missing(raft::handle_t & handle, const double* d_y, int n_elem)
-  {
-    return thrust::any_of(
-      thrust::cuda::par.on(handle.get_stream()), d_y, d_y + n_elem, is_missing<double>());
-  }
+void predict(raft::handle_t& handle,
+             const ARIMAMemory<double>& arima_mem,
+             const double* d_y,
+             const double* d_exog,
+             const double* d_exog_fut,
+             int batch_size,
+             int n_obs,
+             int start,
+             int end,
+             const ARIMAOrder& order,
+             const ARIMAParams<double>& params,
+             double* d_y_p,
+             bool pre_diff,
+             double level,
+             double* d_lower,
+             double* d_upper)
+{
+  raft::common::nvtx::range fun_scope(__func__);
+  const auto stream = handle.get_stream();
+
+  bool diff     = order.need_diff() && pre_diff && level == 0;
+  int num_steps = std::max(end - n_obs, 0);
+
+  // Prepare data
+  int n_obs_kf;
+  const double* d_y_kf;
+  const double* d_exog_kf;
+  const double* d_exog_fut_kf = d_exog_fut;
+  ARIMAOrder order_after_prep = order;
+  rmm::device_uvector<double> exog_fut_buffer(0, stream);
+  if (diff) {
+    n_obs_kf = n_obs - order.n_diff();
+    MLCommon::TimeSeries::prepare_data(
+      arima_mem.y_diff, d_y, batch_size, n_obs, order.d, order.D, order.s, stream);
+    if (order.n_exog > 0) {
+      MLCommon::TimeSeries::prepare_data(arima_mem.exog_diff,
+                                         d_exog,
+                                         order.n_exog * batch_size,
+                                         n_obs,
+                                         order.d,
+                                         order.D,
+                                         order.s,
+                                         stream);
 
-  void predict(raft::handle_t & handle,
-               const ARIMAMemory<double>& arima_mem,
-               const double* d_y,
-               const double* d_exog,
-               const double* d_exog_fut,
-               int batch_size,
-               int n_obs,
-               int start,
-               int end,
-               const ARIMAOrder& order,
-               const ARIMAParams<double>& params,
-               double* d_y_p,
-               bool pre_diff,
-               double level,
-               double* d_lower,
-               double* d_upper)
-  {
-    raft::common::nvtx::range fun_scope(__func__);
-    const auto stream = handle.get_stream();
-
-    bool diff     = order.need_diff() && pre_diff && level == 0;
-    int num_steps = std::max(end - n_obs, 0);
-
-    // Prepare data
-    int n_obs_kf;
-    const double* d_y_kf;
-    const double* d_exog_kf;
-    const double* d_exog_fut_kf = d_exog_fut;
-    ARIMAOrder order_after_prep = order;
-    rmm::device_uvector<double> exog_fut_buffer(0, stream);
-    if (diff) {
-      n_obs_kf = n_obs - order.n_diff();
-      MLCommon::TimeSeries::prepare_data(
-        arima_mem.y_diff, d_y, batch_size, n_obs, order.d, order.D, order.s, stream);
-      if (order.n_exog > 0) {
-        MLCommon::TimeSeries::prepare_data(arima_mem.exog_diff,
-                                           d_exog,
-                                           order.n_exog * batch_size,
-                                           n_obs,
-                                           order.d,
-                                           order.D,
-                                           order.s,
-                                           stream);
-
-        if (num_steps > 0) {
-          exog_fut_buffer.resize(num_steps * order.n_exog * batch_size, stream);
-
-          MLCommon::TimeSeries::prepare_future_data(exog_fut_buffer.data(),
-                                                    d_exog,
-                                                    d_exog_fut,
-                                                    order.n_exog * batch_size,
-                                                    n_obs,
-                                                    num_steps,
-                                                    order.d,
-                                                    order.D,
-                                                    order.s,
-                                                    stream);
-
-          d_exog_fut_kf = exog_fut_buffer.data();
-        }
+      if (num_steps > 0) {
+        exog_fut_buffer.resize(num_steps * order.n_exog * batch_size, stream);
+
+        MLCommon::TimeSeries::prepare_future_data(exog_fut_buffer.data(),
+                                                  d_exog,
+                                                  d_exog_fut,
+                                                  order.n_exog * batch_size,
+                                                  n_obs,
+                                                  num_steps,
+                                                  order.d,
+                                                  order.D,
+                                                  order.s,
+                                                  stream);
+
+        d_exog_fut_kf = exog_fut_buffer.data();
       }
-      order_after_prep.d = 0;
-      order_after_prep.D = 0;
-
-      d_y_kf    = arima_mem.y_diff;
-      d_exog_kf = arima_mem.exog_diff;
-    } else {
-      n_obs_kf  = n_obs;
-      d_y_kf    = d_y;
-      d_exog_kf = d_exog;
     }
+    order_after_prep.d = 0;
+    order_after_prep.D = 0;
+
+    d_y_kf    = arima_mem.y_diff;
+    d_exog_kf = arima_mem.exog_diff;
+  } else {
+    n_obs_kf  = n_obs;
+    d_y_kf    = d_y;
+    d_exog_kf = d_exog;
+  }
 
-    double* d_pred = arima_mem.pred;
-
-    // Create temporary array for the forecasts
-    rmm::device_uvector<double> fc_buffer(num_steps * batch_size, stream);
-    double* d_y_fc = fc_buffer.data();
+  double* d_pred = arima_mem.pred;
+
+  // Create temporary array for the forecasts
+  rmm::device_uvector<double> fc_buffer(num_steps * batch_size, stream);
+  double* d_y_fc = fc_buffer.data();
+
+  // Compute the residual and forecast
+  std::vector<double> loglike = std::vector<double>(batch_size);
+  /// TODO: use device loglike to avoid useless copy ; part of #2233
+  batched_loglike(handle,
+                  arima_mem,
+                  d_y_kf,
+                  d_exog_kf,
+                  batch_size,
+                  n_obs_kf,
+                  order_after_prep,
+                  params,
+                  loglike.data(),
+                  false,
+                  true,
+                  MLE,
+                  0,
+                  num_steps,
+                  d_y_fc,
+                  d_exog_fut_kf,
+                  level,
+                  d_lower,
+                  d_upper);
+
+  auto counting  = thrust::make_counting_iterator(0);
+  int predict_ld = end - start;
+
+  //
+  // In-sample prediction
+  //
+
+  // The prediction loop starts by filling undefined predictions with NaN,
+  // then computes the predictions from the observations and residuals
+  if (start < n_obs) {
+    int res_offset = diff ? order.d + order.s * order.D : 0;
+    int p_start    = std::max(start, res_offset);
+    int p_end      = std::min(n_obs, end);
+    int dD         = diff ? order.d + order.D : 0;
+    int period1    = order.d ? 1 : order.s;
+    int period2    = order.d == 2 ? 1 : order.s;
 
-    // Compute the residual and forecast
-    std::vector<double> loglike = std::vector<double>(batch_size);
-    /// TODO: use device loglike to avoid useless copy ; part of #2233
-    batched_loglike(handle,
-                    arima_mem,
-                    d_y_kf,
-                    d_exog_kf,
-                    batch_size,
-                    n_obs_kf,
-                    order_after_prep,
-                    params,
-                    loglike.data(),
-                    false,
-                    true,
-                    MLE,
-                    0,
-                    num_steps,
-                    d_y_fc,
-                    d_exog_fut_kf,
-                    level,
-                    d_lower,
-                    d_upper);
-
-    auto counting  = thrust::make_counting_iterator(0);
-    int predict_ld = end - start;
-
-    //
-    // In-sample prediction
-    //
-
-    // The prediction loop starts by filling undefined predictions with NaN,
-    // then computes the predictions from the observations and residuals
-    if (start < n_obs) {
-      int res_offset = diff ? order.d + order.s * order.D : 0;
-      int p_start    = std::max(start, res_offset);
-      int p_end      = std::min(n_obs, end);
-      int dD         = diff ? order.d + order.D : 0;
-      int period1    = order.d ? 1 : order.s;
-      int period2    = order.d == 2 ? 1 : order.s;
-
-      thrust::for_each(
-        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-          d_y_p[0] = 0.0;
-          for (int i = 0; i < res_offset - start; i++) {
-            d_y_p[bid * predict_ld + i] = nan("");
-          }
-          for (int i = p_start; i < p_end; i++) {
-            if (dD == 0) {
-              d_y_p[bid * predict_ld + i - start] = d_pred[bid * n_obs + i];
-            } else if (dD == 1) {
-              d_y_p[bid * predict_ld + i - start] =
-                d_y[bid * n_obs + i - period1] + d_pred[bid * n_obs_kf + i - res_offset];
-            } else {
-              d_y_p[bid * predict_ld + i - start] =
-                d_y[bid * n_obs + i - period1] + d_y[bid * n_obs + i - period2] -
-                d_y[bid * n_obs + i - period1 - period2] + d_pred[bid * n_obs_kf + i - res_offset];
-            }
+    thrust::for_each(
+      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+        d_y_p[0] = 0.0;
+        for (int i = 0; i < res_offset - start; i++) {
+          d_y_p[bid * predict_ld + i] = nan("");
+        }
+        for (int i = p_start; i < p_end; i++) {
+          if (dD == 0) {
+            d_y_p[bid * predict_ld + i - start] = d_pred[bid * n_obs + i];
+          } else if (dD == 1) {
+            d_y_p[bid * predict_ld + i - start] =
+              d_y[bid * n_obs + i - period1] + d_pred[bid * n_obs_kf + i - res_offset];
+          } else {
+            d_y_p[bid * predict_ld + i - start] =
+              d_y[bid * n_obs + i - period1] + d_y[bid * n_obs + i - period2] -
+              d_y[bid * n_obs + i - period1 - period2] + d_pred[bid * n_obs_kf + i - res_offset];
           }
-        });
-    }
-
-    //
-    // Finalize out-of-sample forecast and copy in-sample predictions
-    //
+        }
+      });
+  }
 
-    if (num_steps) {
-      if (diff) {
-        MLCommon::TimeSeries::finalize_forecast(
-          d_y_fc, d_y, num_steps, batch_size, n_obs, n_obs, order.d, order.D, order.s, stream);
-      }
+  //
+  // Finalize out-of-sample forecast and copy in-sample predictions
+  //
 
-      // Copy forecast in d_y_p
-      thrust::for_each(
-        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-          for (int i = 0; i < num_steps; i++) {
-            d_y_p[bid * predict_ld + n_obs - start + i] = d_y_fc[num_steps * bid + i];
-          }
-        });
-      /// TODO: 2D copy kernel?
+  if (num_steps) {
+    if (diff) {
+      MLCommon::TimeSeries::finalize_forecast(
+        d_y_fc, d_y, num_steps, batch_size, n_obs, n_obs, order.d, order.D, order.s, stream);
     }
-  }
 
-  /**
-   * Kernel to compute the sum-of-squares log-likelihood estimation
-   *
-   * @param[in]  d_y        Series to fit
-   * @param[in]  d_mu       mu parameters
-   * @param[in]  d_ar       AR parameters
-   * @param[in]  d_ma       MA parameters
-   * @param[in]  d_sar      Seasonal AR parameters
-   * @param[in]  d_sma      Seasonal MA parameters
-   * @param[out] d_loglike  Evaluated log-likelihood
-   * @param[in]  n_obs      Number of observations in a time series
-   * @param[in]  n_phi      Number of phi coefficients (combined AR-SAR)
-   * @param[in]  n_theta    Number of theta coefficients (combined MA-SMA)
-   * @param[in]  p          Number of AR parameters
-   * @param[in]  q          Number of MA parameters
-   * @param[in]  P          Number of seasonal AR parameters
-   * @param[in]  Q          Number of seasonal MA parameters
-   * @param[in]  s          Seasonal period or 0
-   * @param[in]  k          Whether to use an intercept
-   * @param[in]  start_sum  At which index to start the sum
-   * @param[in]  start_y    First used y index (observation)
-   * @param[in]  start_v    First used v index (residual)
-   */
-  template <typename DataT>
-  __global__ void sum_of_squares_kernel(const DataT* d_y,
-                                        const DataT* d_mu,
-                                        const DataT* d_ar,
-                                        const DataT* d_ma,
-                                        const DataT* d_sar,
-                                        const DataT* d_sma,
-                                        DataT* d_loglike,
-                                        int n_obs,
-                                        int n_phi,
-                                        int n_theta,
-                                        int p,
-                                        int q,
-                                        int P,
-                                        int Q,
-                                        int s,
-                                        int k,
-                                        int start_sum,
-                                        int start_y,
-                                        int start_v)
-  {
-    // Load phi, theta and mu to registers
-    DataT phi, theta;
-    if (threadIdx.x < n_phi) {
-      phi = MLCommon::TimeSeries::reduced_polynomial<true>(
-        blockIdx.x, d_ar, p, d_sar, P, s, threadIdx.x + 1);
-    }
-    if (threadIdx.x < n_theta) {
-      theta = MLCommon::TimeSeries::reduced_polynomial<false>(
-        blockIdx.x, d_ma, q, d_sma, Q, s, threadIdx.x + 1);
-    }
-    DataT mu = k ? d_mu[blockIdx.x] : (DataT)0;
-
-    // Shared memory: load y and initialize the residuals
-    extern __shared__ DataT shared_mem[];
-    DataT* b_y  = shared_mem;
-    DataT* b_vs = shared_mem + n_obs - start_y;
-    for (int i = threadIdx.x; i < n_obs - start_y; i += blockDim.x) {
-      b_y[i] = d_y[n_obs * blockIdx.x + i + start_y];
-    }
-    for (int i = threadIdx.x; i < start_sum - start_v; i += blockDim.x) {
-      b_vs[i] = (DataT)0;
-    }
+    // Copy forecast in d_y_p
+    thrust::for_each(
+      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+        for (int i = 0; i < num_steps; i++) {
+          d_y_p[bid * predict_ld + n_obs - start + i] = d_y_fc[num_steps * bid + i];
+        }
+      });
+    /// TODO: 2D copy kernel?
+  }
+}
 
-    // Main loop
-    char* temp_smem = (char*)(shared_mem + 2 * n_obs - start_y - start_v);
-    DataT res, ssq = 0;
-    for (int i = start_sum; i < n_obs; i++) {
-      __syncthreads();
-      res = (DataT)0;
-      res -= threadIdx.x < n_phi ? phi * b_y[i - threadIdx.x - 1 - start_y] : (DataT)0;
-      res -= threadIdx.x < n_theta ? theta * b_vs[i - threadIdx.x - 1 - start_v] : (DataT)0;
-      res = raft::blockReduce(res, temp_smem);
-      if (threadIdx.x == 0) {
-        res += b_y[i - start_y] - mu;
-        b_vs[i - start_v] = res;
-        ssq += res * res;
-      }
-    }
+/**
+ * Kernel to compute the sum-of-squares log-likelihood estimation
+ *
+ * @param[in]  d_y        Series to fit
+ * @param[in]  d_mu       mu parameters
+ * @param[in]  d_ar       AR parameters
+ * @param[in]  d_ma       MA parameters
+ * @param[in]  d_sar      Seasonal AR parameters
+ * @param[in]  d_sma      Seasonal MA parameters
+ * @param[out] d_loglike  Evaluated log-likelihood
+ * @param[in]  n_obs      Number of observations in a time series
+ * @param[in]  n_phi      Number of phi coefficients (combined AR-SAR)
+ * @param[in]  n_theta    Number of theta coefficients (combined MA-SMA)
+ * @param[in]  p          Number of AR parameters
+ * @param[in]  q          Number of MA parameters
+ * @param[in]  P          Number of seasonal AR parameters
+ * @param[in]  Q          Number of seasonal MA parameters
+ * @param[in]  s          Seasonal period or 0
+ * @param[in]  k          Whether to use an intercept
+ * @param[in]  start_sum  At which index to start the sum
+ * @param[in]  start_y    First used y index (observation)
+ * @param[in]  start_v    First used v index (residual)
+ */
+template <typename DataT>
+__global__ void sum_of_squares_kernel(const DataT* d_y,
+                                      const DataT* d_mu,
+                                      const DataT* d_ar,
+                                      const DataT* d_ma,
+                                      const DataT* d_sar,
+                                      const DataT* d_sma,
+                                      DataT* d_loglike,
+                                      int n_obs,
+                                      int n_phi,
+                                      int n_theta,
+                                      int p,
+                                      int q,
+                                      int P,
+                                      int Q,
+                                      int s,
+                                      int k,
+                                      int start_sum,
+                                      int start_y,
+                                      int start_v)
+{
+  // Load phi, theta and mu to registers
+  DataT phi, theta;
+  if (threadIdx.x < n_phi) {
+    phi = MLCommon::TimeSeries::reduced_polynomial<true>(
+      blockIdx.x, d_ar, p, d_sar, P, s, threadIdx.x + 1);
+  }
+  if (threadIdx.x < n_theta) {
+    theta = MLCommon::TimeSeries::reduced_polynomial<false>(
+      blockIdx.x, d_ma, q, d_sma, Q, s, threadIdx.x + 1);
+  }
+  DataT mu = k ? d_mu[blockIdx.x] : (DataT)0;
+
+  // Shared memory: load y and initialize the residuals
+  extern __shared__ DataT shared_mem[];
+  DataT* b_y  = shared_mem;
+  DataT* b_vs = shared_mem + n_obs - start_y;
+  for (int i = threadIdx.x; i < n_obs - start_y; i += blockDim.x) {
+    b_y[i] = d_y[n_obs * blockIdx.x + i + start_y];
+  }
+  for (int i = threadIdx.x; i < start_sum - start_v; i += blockDim.x) {
+    b_vs[i] = (DataT)0;
+  }
 
-    // Compute log-likelihood and write it to global memory
+  // Main loop
+  char* temp_smem = (char*)(shared_mem + 2 * n_obs - start_y - start_v);
+  DataT res, ssq = 0;
+  for (int i = start_sum; i < n_obs; i++) {
+    __syncthreads();
+    res = (DataT)0;
+    res -= threadIdx.x < n_phi ? phi * b_y[i - threadIdx.x - 1 - start_y] : (DataT)0;
+    res -= threadIdx.x < n_theta ? theta * b_vs[i - threadIdx.x - 1 - start_v] : (DataT)0;
+    res = raft::blockReduce(res, temp_smem);
     if (threadIdx.x == 0) {
-      d_loglike[blockIdx.x] =
-        -0.5 * static_cast<DataT>(n_obs) * raft::myLog(ssq / static_cast<DataT>(n_obs - start_sum));
+      res += b_y[i - start_y] - mu;
+      b_vs[i - start_v] = res;
+      ssq += res * res;
     }
   }
 
-  /**
-   * Sum-of-squares estimation method
-   *
-   * @param[in]  handle     cuML handle
-   * @param[in]  d_y        Series to fit: shape = (n_obs, batch_size)
-   * @param[in]  batch_size Number of time series
-   * @param[in]  n_obs      Number of observations in a time series
-   * @param[in]  order      ARIMA hyper-parameters
-   * @param[in]  Tparams    Transformed parameters
-   * @param[out] d_loglike  Evaluated log-likelihood (device)
-   * @param[in]  truncate   Number of observations to skip in the sum
-   */
-  void conditional_sum_of_squares(raft::handle_t & handle,
-                                  const double* d_y,
-                                  int batch_size,
-                                  int n_obs,
-                                  const ARIMAOrder& order,
-                                  const ARIMAParams<double>& Tparams,
-                                  double* d_loglike,
-                                  int truncate)
-  {
-    raft::common::nvtx::range fun_scope(__func__);
-    auto stream = handle.get_stream();
-
-    int n_phi     = order.n_phi();
-    int n_theta   = order.n_theta();
-    int max_lags  = std::max(n_phi, n_theta);
-    int start_sum = std::max(max_lags, truncate);
-    int start_y   = start_sum - n_phi;
-    int start_v   = start_sum - n_theta;
-
-    // Compute the sum-of-squares and the log-likelihood
-    int n_warps            = std::max(raft::ceildiv<int>(max_lags, 32), 1);
-    size_t shared_mem_size = (2 * n_obs - start_y - start_v + n_warps) * sizeof(double);
-    sum_of_squares_kernel<<<batch_size, 32 * n_warps, shared_mem_size, stream>>>(d_y,
-                                                                                 Tparams.mu,
-                                                                                 Tparams.ar,
-                                                                                 Tparams.ma,
-                                                                                 Tparams.sar,
-                                                                                 Tparams.sma,
-                                                                                 d_loglike,
-                                                                                 n_obs,
-                                                                                 n_phi,
-                                                                                 n_theta,
-                                                                                 order.p,
-                                                                                 order.q,
-                                                                                 order.P,
-                                                                                 order.Q,
-                                                                                 order.s,
-                                                                                 order.k,
-                                                                                 start_sum,
-                                                                                 start_y,
-                                                                                 start_v);
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  // Compute log-likelihood and write it to global memory
+  if (threadIdx.x == 0) {
+    d_loglike[blockIdx.x] =
+      -0.5 * static_cast<DataT>(n_obs) * raft::myLog(ssq / static_cast<DataT>(n_obs - start_sum));
   }
+}
 
-  void batched_loglike(raft::handle_t & handle,
-                       const ARIMAMemory<double>& arima_mem,
-                       const double* d_y,
-                       const double* d_exog,
-                       int batch_size,
-                       int n_obs,
-                       const ARIMAOrder& order,
-                       const ARIMAParams<double>& params,
-                       double* loglike,
-                       bool trans,
-                       bool host_loglike,
-                       LoglikeMethod method,
-                       int truncate,
-                       int fc_steps,
-                       double* d_fc,
-                       const double* d_exog_fut,
-                       double level,
-                       double* d_lower,
-                       double* d_upper)
-  {
-    raft::common::nvtx::range fun_scope(__func__);
-
-    auto stream = handle.get_stream();
-
-    double* d_pred = arima_mem.pred;
-
-    ARIMAParams<double> Tparams = {params.mu,
-                                   params.beta,
-                                   arima_mem.Tparams_ar,
-                                   arima_mem.Tparams_ma,
-                                   arima_mem.Tparams_sar,
-                                   arima_mem.Tparams_sma,
-                                   arima_mem.Tparams_sigma2};
-
-    ASSERT(method == MLE || fc_steps == 0, "Only MLE method is valid for forecasting");
-
-    /* Create log-likelihood device array if host pointer is provided */
-    double* d_loglike = host_loglike ? arima_mem.loglike : loglike;
-
-    if (trans) {
-      MLCommon::TimeSeries::batched_jones_transform(
-        order, batch_size, false, params, Tparams, stream);
-    } else {
-      // non-transformed case: just use original parameters
-      Tparams.ar     = params.ar;
-      Tparams.ma     = params.ma;
-      Tparams.sar    = params.sar;
-      Tparams.sma    = params.sma;
-      Tparams.sigma2 = params.sigma2;
-    }
+/**
+ * Sum-of-squares estimation method
+ *
+ * @param[in]  handle     cuML handle
+ * @param[in]  d_y        Series to fit: shape = (n_obs, batch_size)
+ * @param[in]  batch_size Number of time series
+ * @param[in]  n_obs      Number of observations in a time series
+ * @param[in]  order      ARIMA hyper-parameters
+ * @param[in]  Tparams    Transformed parameters
+ * @param[out] d_loglike  Evaluated log-likelihood (device)
+ * @param[in]  truncate   Number of observations to skip in the sum
+ */
+void conditional_sum_of_squares(raft::handle_t& handle,
+                                const double* d_y,
+                                int batch_size,
+                                int n_obs,
+                                const ARIMAOrder& order,
+                                const ARIMAParams<double>& Tparams,
+                                double* d_loglike,
+                                int truncate)
+{
+  raft::common::nvtx::range fun_scope(__func__);
+  auto stream = handle.get_stream();
 
-    if (method == CSS) {
-      conditional_sum_of_squares(
-        handle, d_y, batch_size, n_obs, order, Tparams, d_loglike, truncate);
-    } else {
-      batched_kalman_filter(handle,
-                            arima_mem,
-                            d_y,
-                            d_exog,
-                            n_obs,
-                            Tparams,
-                            order,
-                            batch_size,
-                            d_loglike,
-                            d_pred,
-                            fc_steps,
-                            d_fc,
-                            d_exog_fut,
-                            level,
-                            d_lower,
-                            d_upper);
-    }
+  int n_phi     = order.n_phi();
+  int n_theta   = order.n_theta();
+  int max_lags  = std::max(n_phi, n_theta);
+  int start_sum = std::max(max_lags, truncate);
+  int start_y   = start_sum - n_phi;
+  int start_v   = start_sum - n_theta;
+
+  // Compute the sum-of-squares and the log-likelihood
+  int n_warps            = std::max(raft::ceildiv<int>(max_lags, 32), 1);
+  size_t shared_mem_size = (2 * n_obs - start_y - start_v + n_warps) * sizeof(double);
+  sum_of_squares_kernel<<<batch_size, 32 * n_warps, shared_mem_size, stream>>>(d_y,
+                                                                               Tparams.mu,
+                                                                               Tparams.ar,
+                                                                               Tparams.ma,
+                                                                               Tparams.sar,
+                                                                               Tparams.sma,
+                                                                               d_loglike,
+                                                                               n_obs,
+                                                                               n_phi,
+                                                                               n_theta,
+                                                                               order.p,
+                                                                               order.q,
+                                                                               order.P,
+                                                                               order.Q,
+                                                                               order.s,
+                                                                               order.k,
+                                                                               start_sum,
+                                                                               start_y,
+                                                                               start_v);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
 
-    if (host_loglike) {
-      /* Tranfer log-likelihood device -> host */
-      raft::update_host(loglike, d_loglike, batch_size, stream);
-    }
+void batched_loglike(raft::handle_t& handle,
+                     const ARIMAMemory<double>& arima_mem,
+                     const double* d_y,
+                     const double* d_exog,
+                     int batch_size,
+                     int n_obs,
+                     const ARIMAOrder& order,
+                     const ARIMAParams<double>& params,
+                     double* loglike,
+                     bool trans,
+                     bool host_loglike,
+                     LoglikeMethod method,
+                     int truncate,
+                     int fc_steps,
+                     double* d_fc,
+                     const double* d_exog_fut,
+                     double level,
+                     double* d_lower,
+                     double* d_upper)
+{
+  raft::common::nvtx::range fun_scope(__func__);
+
+  auto stream = handle.get_stream();
+
+  double* d_pred = arima_mem.pred;
+
+  ARIMAParams<double> Tparams = {params.mu,
+                                 params.beta,
+                                 arima_mem.Tparams_ar,
+                                 arima_mem.Tparams_ma,
+                                 arima_mem.Tparams_sar,
+                                 arima_mem.Tparams_sma,
+                                 arima_mem.Tparams_sigma2};
+
+  ASSERT(method == MLE || fc_steps == 0, "Only MLE method is valid for forecasting");
+
+  /* Create log-likelihood device array if host pointer is provided */
+  double* d_loglike = host_loglike ? arima_mem.loglike : loglike;
+
+  if (trans) {
+    MLCommon::TimeSeries::batched_jones_transform(
+      order, batch_size, false, params, Tparams, stream);
+  } else {
+    // non-transformed case: just use original parameters
+    Tparams.ar     = params.ar;
+    Tparams.ma     = params.ma;
+    Tparams.sar    = params.sar;
+    Tparams.sma    = params.sma;
+    Tparams.sigma2 = params.sigma2;
   }
 
-  void batched_loglike(raft::handle_t & handle,
-                       const ARIMAMemory<double>& arima_mem,
-                       const double* d_y,
-                       const double* d_exog,
-                       int batch_size,
-                       int n_obs,
-                       const ARIMAOrder& order,
-                       const double* d_params,
-                       double* loglike,
-                       bool trans,
-                       bool host_loglike,
-                       LoglikeMethod method,
-                       int truncate)
-  {
-    raft::common::nvtx::range fun_scope(__func__);
-
-    // unpack parameters
-    auto stream = handle.get_stream();
-
-    ARIMAParams<double> params = {arima_mem.params_mu,
-                                  arima_mem.params_beta,
-                                  arima_mem.params_ar,
-                                  arima_mem.params_ma,
-                                  arima_mem.params_sar,
-                                  arima_mem.params_sma,
-                                  arima_mem.params_sigma2};
-
-    params.unpack(order, batch_size, d_params, stream);
+  if (method == CSS) {
+    conditional_sum_of_squares(handle, d_y, batch_size, n_obs, order, Tparams, d_loglike, truncate);
+  } else {
+    batched_kalman_filter(handle,
+                          arima_mem,
+                          d_y,
+                          d_exog,
+                          n_obs,
+                          Tparams,
+                          order,
+                          batch_size,
+                          d_loglike,
+                          d_pred,
+                          fc_steps,
+                          d_fc,
+                          d_exog_fut,
+                          level,
+                          d_lower,
+                          d_upper);
+  }
 
-    batched_loglike(handle,
-                    arima_mem,
-                    d_y,
-                    d_exog,
-                    batch_size,
-                    n_obs,
-                    order,
-                    params,
-                    loglike,
-                    trans,
-                    host_loglike,
-                    method,
-                    truncate);
+  if (host_loglike) {
+    /* Transfer log-likelihood device -> host */
+    raft::update_host(loglike, d_loglike, batch_size, stream);
   }
+}
 
-  void batched_loglike_grad(raft::handle_t & handle,
-                            const ARIMAMemory<double>& arima_mem,
-                            const double* d_y,
-                            const double* d_exog,
-                            int batch_size,
-                            int n_obs,
-                            const ARIMAOrder& order,
-                            const double* d_x,
-                            double* d_grad,
-                            double h,
-                            bool trans,
-                            LoglikeMethod method,
-                            int truncate)
-  {
-    raft::common::nvtx::range fun_scope(__func__);
-    auto stream   = handle.get_stream();
-    auto counting = thrust::make_counting_iterator(0);
-    int N         = order.complexity();
-
-    // Initialize the perturbed x vector
-    double* d_x_pert = arima_mem.x_pert;
-    raft::copy(d_x_pert, d_x, N * batch_size, stream);
-
-    double* d_ll_base = arima_mem.loglike_base;
-    double* d_ll_pert = arima_mem.loglike_pert;
-
-    // Evaluate the log-likelihood with the given parameter vector
-    batched_loglike(handle,
-                    arima_mem,
-                    d_y,
-                    d_exog,
-                    batch_size,
-                    n_obs,
-                    order,
-                    d_x,
-                    d_ll_base,
-                    trans,
-                    false,
-                    method,
-                    truncate);
+void batched_loglike(raft::handle_t& handle,
+                     const ARIMAMemory<double>& arima_mem,
+                     const double* d_y,
+                     const double* d_exog,
+                     int batch_size,
+                     int n_obs,
+                     const ARIMAOrder& order,
+                     const double* d_params,
+                     double* loglike,
+                     bool trans,
+                     bool host_loglike,
+                     LoglikeMethod method,
+                     int truncate)
+{
+  raft::common::nvtx::range fun_scope(__func__);
+
+  // unpack parameters
+  auto stream = handle.get_stream();
+
+  ARIMAParams<double> params = {arima_mem.params_mu,
+                                arima_mem.params_beta,
+                                arima_mem.params_ar,
+                                arima_mem.params_ma,
+                                arima_mem.params_sar,
+                                arima_mem.params_sma,
+                                arima_mem.params_sigma2};
+
+  params.unpack(order, batch_size, d_params, stream);
+
+  batched_loglike(handle,
+                  arima_mem,
+                  d_y,
+                  d_exog,
+                  batch_size,
+                  n_obs,
+                  order,
+                  params,
+                  loglike,
+                  trans,
+                  host_loglike,
+                  method,
+                  truncate);
+}
+
+void batched_loglike_grad(raft::handle_t& handle,
+                          const ARIMAMemory<double>& arima_mem,
+                          const double* d_y,
+                          const double* d_exog,
+                          int batch_size,
+                          int n_obs,
+                          const ARIMAOrder& order,
+                          const double* d_x,
+                          double* d_grad,
+                          double h,
+                          bool trans,
+                          LoglikeMethod method,
+                          int truncate)
+{
+  raft::common::nvtx::range fun_scope(__func__);
+  auto stream   = handle.get_stream();
+  auto counting = thrust::make_counting_iterator(0);
+  int N         = order.complexity();
+
+  // Initialize the perturbed x vector
+  double* d_x_pert = arima_mem.x_pert;
+  raft::copy(d_x_pert, d_x, N * batch_size, stream);
+
+  double* d_ll_base = arima_mem.loglike_base;
+  double* d_ll_pert = arima_mem.loglike_pert;
+
+  // Evaluate the log-likelihood with the given parameter vector
+  batched_loglike(handle,
+                  arima_mem,
+                  d_y,
+                  d_exog,
+                  batch_size,
+                  n_obs,
+                  order,
+                  d_x,
+                  d_ll_base,
+                  trans,
+                  false,
+                  method,
+                  truncate);
 
 <<<<<<< Updated upstream
-    for (int i = 0; i < N; i++) {
-      // Add the perturbation to the i-th parameter
+  for (int i = 0; i < N; i++) {
+    // Add the perturbation to the i-th parameter
 =======
-    // First derivative with a first-order accuracy
-    thrust::for_each(
-      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-        d_grad[N * bid + i] = (d_ll_pert[bid] - d_ll_base[bid]) / h;
-      });
+  // First derivative with a first-order accuracy
+  thrust::for_each(
+    thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+      d_grad[N * bid + i] = (d_ll_pert[bid] - d_ll_base[bid]) / h;
+    });
 
-    // Reset the i-th parameter
-    thrust::for_each(
-      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-        d_x_pert[N * bid + i] = d_x[N * bid + i];
-      });
-  }
+  // Reset the i-th parameter
+  thrust::for_each(
+    thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+      d_x_pert[N * bid + i] = d_x[N * bid + i];
+    });
+}
 }
 
 void information_criterion(raft::handle_t& handle,
@@ -885,443 +866,443 @@ void _start_params(raft::handle_t& handle,
       double* d_exog_coef = bm_exog_coef.raw_data();
       const int& n_exog = order.n_exog;
 >>>>>>> Stashed changes
-      thrust::for_each(
-        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-          d_x_pert[N * bid + i] = d_x[N * bid + i] + h;
-        });
+    thrust::for_each(
+      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+        d_x_pert[N * bid + i] = d_x[N * bid + i] + h;
+      });
 
-      // Evaluate the log-likelihood with the positive perturbation
-      batched_loglike(handle,
-                      arima_mem,
-                      d_y,
-                      d_exog,
-                      batch_size,
-                      n_obs,
-                      order,
-                      d_x_pert,
-                      d_ll_pert,
-                      trans,
-                      false,
-                      method,
-                      truncate);
-
-      // First derivative with a first-order accuracy
-      thrust::for_each(
-        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-          d_grad[N * bid + i] = (d_ll_pert[bid] - d_ll_base[bid]) / h;
-        });
+    // Evaluate the log-likelihood with the positive perturbation
+    batched_loglike(handle,
+                    arima_mem,
+                    d_y,
+                    d_exog,
+                    batch_size,
+                    n_obs,
+                    order,
+                    d_x_pert,
+                    d_ll_pert,
+                    trans,
+                    false,
+                    method,
+                    truncate);
 
-      // Reset the i-th parameter
-      thrust::for_each(
-        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-          d_x_pert[N * bid + i] = d_x[N * bid + i];
-        });
-    }
-  }
+    // First derivative with a first-order accuracy
+    thrust::for_each(
+      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+        d_grad[N * bid + i] = (d_ll_pert[bid] - d_ll_base[bid]) / h;
+      });
 
-  void information_criterion(raft::handle_t & handle,
-                             const ARIMAMemory<double>& arima_mem,
-                             const double* d_y,
-                             const double* d_exog,
-                             int batch_size,
-                             int n_obs,
-                             const ARIMAOrder& order,
-                             const ARIMAParams<double>& params,
-                             double* d_ic,
-                             int ic_type)
-  {
-    raft::common::nvtx::range fun_scope(__func__);
-    auto stream = handle.get_stream();
-
-    /* Compute log-likelihood in d_ic */
-    batched_loglike(
-      handle, arima_mem, d_y, d_exog, batch_size, n_obs, order, params, d_ic, false, false, MLE);
-
-    /* Compute information criterion from log-likelihood and base term */
-    raft::stats::information_criterion_batched(d_ic,
-                                               d_ic,
-                                               static_cast<raft::stats::IC_Type>(ic_type),
-                                               order.complexity(),
-                                               batch_size,
-                                               n_obs - order.n_diff(),
-                                               stream);
+    // Reset the i-th parameter
+    thrust::for_each(
+      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+        d_x_pert[N * bid + i] = d_x[N * bid + i];
+      });
   }
+}
 
-  /**
-   * Test that the parameters are valid for the inverse transform
-   *
-   * @tparam isAr        Are these (S)AR or (S)MA parameters?
-   * @param[in]  params  Parameters
-   * @param[in]  pq      p for AR, q for MA, P for SAR, Q for SMA
-   */
-  template <bool isAr>
-  DI bool test_invparams(const double* params, int pq)
-  {
-    double new_params[8];
-    double tmp[8];
-
-    constexpr double coef = isAr ? 1 : -1;
-
-    for (int i = 0; i < pq; i++) {
-      tmp[i]        = params[i];
-      new_params[i] = tmp[i];
-    }
+void information_criterion(raft::handle_t& handle,
+                           const ARIMAMemory<double>& arima_mem,
+                           const double* d_y,
+                           const double* d_exog,
+                           int batch_size,
+                           int n_obs,
+                           const ARIMAOrder& order,
+                           const ARIMAParams<double>& params,
+                           double* d_ic,
+                           int ic_type)
+{
+  raft::common::nvtx::range fun_scope(__func__);
+  auto stream = handle.get_stream();
 
-    // Perform inverse transform and stop before atanh step
-    for (int j = pq - 1; j > 0; --j) {
-      double a = new_params[j];
-      for (int k = 0; k < j; ++k) {
-        tmp[k] = (new_params[k] + coef * a * new_params[j - k - 1]) / (1 - (a * a));
-      }
-      for (int iter = 0; iter < j; ++iter) {
-        new_params[iter] = tmp[iter];
-      }
-    }
+  /* Compute log-likelihood in d_ic */
+  batched_loglike(
+    handle, arima_mem, d_y, d_exog, batch_size, n_obs, order, params, d_ic, false, false, MLE);
 
-    // Verify that the values are between -1 and 1
-    bool result = true;
-    for (int i = 0; i < pq; i++) {
-      result = result && !(new_params[i] <= -1 || new_params[i] >= 1);
-    }
-    return result;
+  /* Compute information criterion from log-likelihood and base term */
+  raft::stats::information_criterion_batched(d_ic,
+                                             d_ic,
+                                             static_cast<raft::stats::IC_Type>(ic_type),
+                                             order.complexity(),
+                                             batch_size,
+                                             n_obs - order.n_diff(),
+                                             stream);
+}
+
+/**
+ * Test that the parameters are valid for the inverse transform
+ *
+ * @tparam isAr        Are these (S)AR or (S)MA parameters?
+ * @param[in]  params  Parameters
+ * @param[in]  pq      p for AR, q for MA, P for SAR, Q for SMA
+ */
+template <bool isAr>
+DI bool test_invparams(const double* params, int pq)
+{
+  double new_params[8];
+  double tmp[8];
+
+  constexpr double coef = isAr ? 1 : -1;
+
+  for (int i = 0; i < pq; i++) {
+    tmp[i]        = params[i];
+    new_params[i] = tmp[i];
   }
 
-  /**
-   * Auxiliary function of _start_params: least square approximation of an
-   * ARMA model (with or without seasonality)
-   * @note: in this function the non-seasonal case has s=1, not s=0!
-   */
-  void _arma_least_squares(raft::handle_t & handle,
-                           double* d_ar,
-                           double* d_ma,
-                           double* d_sigma2,
-                           const MLCommon::LinAlg::Batched::Matrix<double>& bm_y,
-                           int p,
-                           int q,
-                           int s,
-                           bool estimate_sigma2,
-                           int k        = 0,
-                           double* d_mu = nullptr)
-  {
-    const auto& handle_impl = handle;
-    auto stream             = handle_impl.get_stream();
-    auto cublas_handle      = handle_impl.get_cublas_handle();
-    auto counting           = thrust::make_counting_iterator(0);
-
-    int batch_size = bm_y.batches();
-    int n_obs      = bm_y.shape().first;
-
-    int ps = p * s, qs = q * s;
-    int p_ar = std::max(ps, 2 * qs);
-    int r    = std::max(p_ar + qs, ps);
-
-    if ((q && p_ar >= n_obs - p_ar) || p + q + k >= n_obs - r) {
-      // Too few observations for the estimate, fill with 0 (1 for sigma2)
-      if (k) RAFT_CUDA_TRY(cudaMemsetAsync(d_mu, 0, sizeof(double) * batch_size, stream));
-      if (p) RAFT_CUDA_TRY(cudaMemsetAsync(d_ar, 0, sizeof(double) * p * batch_size, stream));
-      if (q) RAFT_CUDA_TRY(cudaMemsetAsync(d_ma, 0, sizeof(double) * q * batch_size, stream));
-      if (estimate_sigma2) {
-        thrust::device_ptr<double> sigma2_thrust = thrust::device_pointer_cast(d_sigma2);
-        thrust::fill(thrust::cuda::par.on(stream), sigma2_thrust, sigma2_thrust + batch_size, 1.0);
-      }
-      return;
+  // Perform inverse transform and stop before atanh step
+  for (int j = pq - 1; j > 0; --j) {
+    double a = new_params[j];
+    for (int k = 0; k < j; ++k) {
+      tmp[k] = (new_params[k] + coef * a * new_params[j - k - 1]) / (1 - (a * a));
     }
-
-    /* Matrix formed by lag matrices of y and the residuals respectively,
-     * side by side. The left side will be used to estimate AR, the right
-     * side to estimate MA */
-    MLCommon::LinAlg::Batched::Matrix<double> bm_ls_ar_res(
-      n_obs - r, p + q + k, batch_size, cublas_handle, stream, false);
-    int ar_offset  = r - ps;
-    int res_offset = r - p_ar - qs;
-
-    // Get residuals from an AR(p_ar) model to estimate the MA parameters
-    if (q) {
-      // Create lagged y
-      int ls_height = n_obs - p_ar;
-      MLCommon::LinAlg::Batched::Matrix<double> bm_ls =
-        MLCommon::LinAlg::Batched::b_lagged_mat(bm_y, p_ar);
-
-      /* Matrix for the initial AR fit, initialized by copy of y
-       * (note: this is because gels works in-place ; the matrix has larger
-       *  dimensions than the actual AR fit) */
-      MLCommon::LinAlg::Batched::Matrix<double> bm_ar_fit =
-        MLCommon::LinAlg::Batched::b_2dcopy(bm_y, p_ar, 0, ls_height, 1);
-
-      // Residual, initialized as offset y to avoid one kernel call
-      MLCommon::LinAlg::Batched::Matrix<double> bm_residual(bm_ar_fit);
-
-      // Initial AR fit
-      MLCommon::LinAlg::Batched::b_gels(bm_ls, bm_ar_fit);
-
-      // Compute residual (technically a gemv)
-      MLCommon::LinAlg::Batched::b_gemm(
-        false, false, ls_height, 1, p_ar, -1.0, bm_ls, bm_ar_fit, 1.0, bm_residual);
-
-      // Lags of the residual
-      MLCommon::LinAlg::Batched::b_lagged_mat(
-        bm_residual, bm_ls_ar_res, q, n_obs - r, res_offset, (n_obs - r) * (k + p), s);
+    for (int iter = 0; iter < j; ++iter) {
+      new_params[iter] = tmp[iter];
     }
+  }
 
-    // Fill the first column of the matrix with 1 if we fit an intercept
-    if (k) {
-      double* d_ls_ar_res = bm_ls_ar_res.raw_data();
-      thrust::for_each(
-        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-          double* b_ls_ar_res = d_ls_ar_res + bid * (n_obs - r) * (p + q + k);
-          for (int i = 0; i < n_obs - r; i++) {
-            b_ls_ar_res[i] = 1.0;
-          }
-        });
-    }
+  // Verify that the values are between -1 and 1
+  bool result = true;
+  for (int i = 0; i < pq; i++) {
+    result = result && !(new_params[i] <= -1 || new_params[i] >= 1);
+  }
+  return result;
+}
 
-    // Lags of y
-    MLCommon::LinAlg::Batched::b_lagged_mat(
-      bm_y, bm_ls_ar_res, p, n_obs - r, ar_offset, (n_obs - r) * k, s);
+/**
+ * Auxiliary function of _start_params: least square approximation of an
+ * ARMA model (with or without seasonality)
+ * @note: in this function the non-seasonal case has s=1, not s=0!
+ */
+void _arma_least_squares(raft::handle_t& handle,
+                         double* d_ar,
+                         double* d_ma,
+                         double* d_sigma2,
+                         const MLCommon::LinAlg::Batched::Matrix<double>& bm_y,
+                         int p,
+                         int q,
+                         int s,
+                         bool estimate_sigma2,
+                         int k        = 0,
+                         double* d_mu = nullptr)
+{
+  const auto& handle_impl = handle;
+  auto stream             = handle_impl.get_stream();
+  auto cublas_handle      = handle_impl.get_cublas_handle();
+  auto counting           = thrust::make_counting_iterator(0);
 
-    /* Initializing the vector for the ARMA fit
-     * (note: also in-place as described for AR fit) */
-    MLCommon::LinAlg::Batched::Matrix<double> bm_arma_fit =
-      MLCommon::LinAlg::Batched::b_2dcopy(bm_y, r, 0, n_obs - r, 1);
+  int batch_size = bm_y.batches();
+  int n_obs      = bm_y.shape().first;
+
+  int ps = p * s, qs = q * s;
+  int p_ar = std::max(ps, 2 * qs);
+  int r    = std::max(p_ar + qs, ps);
 
-    // The residuals will be computed only if sigma2 is requested
-    MLCommon::LinAlg::Batched::Matrix<double> bm_final_residual(
-      n_obs - r, 1, batch_size, cublas_handle, stream, false);
+  if ((q && p_ar >= n_obs - p_ar) || p + q + k >= n_obs - r) {
+    // Too few observations for the estimate, fill with 0 (1 for sigma2)
+    if (k) RAFT_CUDA_TRY(cudaMemsetAsync(d_mu, 0, sizeof(double) * batch_size, stream));
+    if (p) RAFT_CUDA_TRY(cudaMemsetAsync(d_ar, 0, sizeof(double) * p * batch_size, stream));
+    if (q) RAFT_CUDA_TRY(cudaMemsetAsync(d_ma, 0, sizeof(double) * q * batch_size, stream));
     if (estimate_sigma2) {
-      raft::copy(
-        bm_final_residual.raw_data(), bm_arma_fit.raw_data(), (n_obs - r) * batch_size, stream);
+      thrust::device_ptr<double> sigma2_thrust = thrust::device_pointer_cast(d_sigma2);
+      thrust::fill(thrust::cuda::par.on(stream), sigma2_thrust, sigma2_thrust + batch_size, 1.0);
     }
+    return;
+  }
+
+  /* Matrix formed by lag matrices of y and the residuals respectively,
+   * side by side. The left side will be used to estimate AR, the right
+   * side to estimate MA */
+  MLCommon::LinAlg::Batched::Matrix<double> bm_ls_ar_res(
+    n_obs - r, p + q + k, batch_size, cublas_handle, stream, false);
+  int ar_offset  = r - ps;
+  int res_offset = r - p_ar - qs;
 
-    // ARMA fit
-    MLCommon::LinAlg::Batched::b_gels(bm_ls_ar_res, bm_arma_fit);
+  // Get residuals from an AR(p_ar) model to estimate the MA parameters
+  if (q) {
+    // Create lagged y
+    int ls_height = n_obs - p_ar;
+    MLCommon::LinAlg::Batched::Matrix<double> bm_ls =
+      MLCommon::LinAlg::Batched::b_lagged_mat(bm_y, p_ar);
 
-    // Copy the results in the parameter vectors
-    const double* d_arma_fit = bm_arma_fit.raw_data();
+    /* Matrix for the initial AR fit, initialized by copy of y
+     * (note: this is because gels works in-place ; the matrix has larger
+     *  dimensions than the actual AR fit) */
+    MLCommon::LinAlg::Batched::Matrix<double> bm_ar_fit =
+      MLCommon::LinAlg::Batched::b_2dcopy(bm_y, p_ar, 0, ls_height, 1);
+
+    // Residual, initialized as offset y to avoid one kernel call
+    MLCommon::LinAlg::Batched::Matrix<double> bm_residual(bm_ar_fit);
+
+    // Initial AR fit
+    MLCommon::LinAlg::Batched::b_gels(bm_ls, bm_ar_fit);
+
+    // Compute residual (technically a gemv)
+    MLCommon::LinAlg::Batched::b_gemm(
+      false, false, ls_height, 1, p_ar, -1.0, bm_ls, bm_ar_fit, 1.0, bm_residual);
+
+    // Lags of the residual
+    MLCommon::LinAlg::Batched::b_lagged_mat(
+      bm_residual, bm_ls_ar_res, q, n_obs - r, res_offset, (n_obs - r) * (k + p), s);
+  }
+
+  // Fill the first column of the matrix with 1 if we fit an intercept
+  if (k) {
+    double* d_ls_ar_res = bm_ls_ar_res.raw_data();
     thrust::for_each(
       thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-        const double* b_arma_fit = d_arma_fit + bid * (n_obs - r);
-        if (k) { d_mu[bid] = b_arma_fit[0]; }
-        if (p) {
-          double* b_ar = d_ar + bid * p;
-          for (int i = 0; i < p; i++) {
-            b_ar[i] = b_arma_fit[i + k];
-          }
-        }
-        if (q) {
-          double* b_ma = d_ma + bid * q;
-          for (int i = 0; i < q; i++) {
-            b_ma[i] = b_arma_fit[i + p + k];
-          }
+        double* b_ls_ar_res = d_ls_ar_res + bid * (n_obs - r) * (p + q + k);
+        for (int i = 0; i < n_obs - r; i++) {
+          b_ls_ar_res[i] = 1.0;
         }
       });
+  }
 
-    if (estimate_sigma2) {
-      // Compute final residual (technically a gemv)
-      MLCommon::LinAlg::Batched::b_gemm(false,
-                                        false,
-                                        n_obs - r,
-                                        1,
-                                        p + q + k,
-                                        -1.0,
-                                        bm_ls_ar_res,
-                                        bm_arma_fit,
-                                        1.0,
-                                        bm_final_residual);
-
-      // Compute variance
-      double* d_residual = bm_final_residual.raw_data();
-      thrust::for_each(
-        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-          double acc               = 0.0;
-          const double* b_residual = d_residual + (n_obs - r) * bid;
-          for (int i = q; i < n_obs - r; i++) {
-            double res = b_residual[i];
-            acc += res * res;
-          }
-          d_sigma2[bid] = acc / static_cast<double>(n_obs - r - q);
-        });
-    }
+  // Lags of y
+  MLCommon::LinAlg::Batched::b_lagged_mat(
+    bm_y, bm_ls_ar_res, p, n_obs - r, ar_offset, (n_obs - r) * k, s);
+
+  /* Initializing the vector for the ARMA fit
+   * (note: also in-place as described for AR fit) */
+  MLCommon::LinAlg::Batched::Matrix<double> bm_arma_fit =
+    MLCommon::LinAlg::Batched::b_2dcopy(bm_y, r, 0, n_obs - r, 1);
+
+  // The residuals will be computed only if sigma2 is requested
+  MLCommon::LinAlg::Batched::Matrix<double> bm_final_residual(
+    n_obs - r, 1, batch_size, cublas_handle, stream, false);
+  if (estimate_sigma2) {
+    raft::copy(
+      bm_final_residual.raw_data(), bm_arma_fit.raw_data(), (n_obs - r) * batch_size, stream);
+  }
 
-    // If (S)AR or (S)MA are not valid for the inverse transform, set them to zero
+  // ARMA fit
+  MLCommon::LinAlg::Batched::b_gels(bm_ls_ar_res, bm_arma_fit);
+
+  // Copy the results in the parameter vectors
+  const double* d_arma_fit = bm_arma_fit.raw_data();
+  thrust::for_each(
+    thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+      const double* b_arma_fit = d_arma_fit + bid * (n_obs - r);
+      if (k) { d_mu[bid] = b_arma_fit[0]; }
+      if (p) {
+        double* b_ar = d_ar + bid * p;
+        for (int i = 0; i < p; i++) {
+          b_ar[i] = b_arma_fit[i + k];
+        }
+      }
+      if (q) {
+        double* b_ma = d_ma + bid * q;
+        for (int i = 0; i < q; i++) {
+          b_ma[i] = b_arma_fit[i + p + k];
+        }
+      }
+    });
+
+  if (estimate_sigma2) {
+    // Compute final residual (technically a gemv)
+    MLCommon::LinAlg::Batched::b_gemm(false,
+                                      false,
+                                      n_obs - r,
+                                      1,
+                                      p + q + k,
+                                      -1.0,
+                                      bm_ls_ar_res,
+                                      bm_arma_fit,
+                                      1.0,
+                                      bm_final_residual);
+
+    // Compute variance
+    double* d_residual = bm_final_residual.raw_data();
     thrust::for_each(
       thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-        if (p) {
-          double* b_ar = d_ar + bid * p;
-          bool valid   = test_invparams<true>(b_ar, p);
-          if (!valid) {
-            for (int ip = 0; ip < p; ip++)
-              b_ar[ip] = 0;
-          }
-        }
-        if (q) {
-          double* b_ma = d_ma + bid * q;
-          bool valid   = test_invparams<false>(b_ma, q);
-          if (!valid) {
-            for (int iq = 0; iq < q; iq++)
-              b_ma[iq] = 0;
-          }
+        double acc               = 0.0;
+        const double* b_residual = d_residual + (n_obs - r) * bid;
+        for (int i = q; i < n_obs - r; i++) {
+          double res = b_residual[i];
+          acc += res * res;
         }
+        d_sigma2[bid] = acc / static_cast<double>(n_obs - r - q);
       });
   }
 
-  /**
-   * Auxiliary function of estimate_x0: compute the starting parameters for
-   * the series pre-processed by estimate_x0
-   */
-  void _start_params(raft::handle_t & handle,
-                     ARIMAParams<double> & params,
-                     MLCommon::LinAlg::Batched::Matrix<double> & bm_y,
-                     const MLCommon::LinAlg::Batched::Matrix<double>& bm_exog,
-                     const ARIMAOrder& order)
-  {
-    int batch_size      = bm_exog.batches();
-    cudaStream_t stream = bm_exog.stream();
-
-    // Estimate exog coefficients and subtract component to endog.
-    // Exog coefficients are estimated by fitting a linear regression with X=exog, y=endog
-    if (order.n_exog > 0) {
-      // In most cases, the system will be overdetermined and we can use gels
-      if (bm_exog.shape().first > static_cast<unsigned int>(order.n_exog)) {
-        // Make a copy of the exogenous series for in-place gels
-        MLCommon::LinAlg::Batched::Matrix<double> bm_exog_copy(bm_exog);
-        // Make a copy of the endogenous series for in-place gels
-        MLCommon::LinAlg::Batched::Matrix<double> bm_y_copy(bm_y);
-
-        // Least-squares solution of overdetermined system
-        rmm::device_uvector<int> info(batch_size, stream);
-        b_gels(bm_exog_copy, bm_y_copy, info.data());
-
-        // Make a batched matrix around the exogenous coefficients
-        rmm::device_uvector<double*> beta_pointers(batch_size, stream);
-        MLCommon::LinAlg::Batched::Matrix<double> bm_exog_coef(order.n_exog,
-                                                               1,
-                                                               batch_size,
-                                                               bm_exog.cublasHandle(),
-                                                               beta_pointers.data(),
-                                                               params.beta,
-                                                               stream,
-                                                               false);
-
-        // Copy the solution of the system to the parameters array
-        b_2dcopy(bm_y_copy, bm_exog_coef, 0, 0, order.n_exog, 1);
-
-        // Set parameters to zero when solving was not successful
-        auto counting       = thrust::make_counting_iterator(0);
-        int* devInfoArray   = info.data();
-        double* d_exog_coef = bm_exog_coef.raw_data();
-        const int& n_exog   = order.n_exog;
-        thrust::for_each(
-          thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-            if (devInfoArray[bid] > 0) {
-              for (int i = 0; i < n_exog; i++) {
-                d_exog_coef[bid * n_exog + i] = 0.0;
-              }
-            }
-          });
-
-        // Compute exogenous component and store the result in bm_y_copy
-        b_gemm(false,
-               false,
-               bm_exog.shape().first,
-               1,
-               bm_exog.shape().second,
-               1.0,
-               bm_exog,
-               bm_exog_coef,
-               0.0,
-               bm_y_copy);
-
-        // Subtract exogenous component to endogenous variable
-        b_aA_op_B(bm_y, bm_y_copy, bm_y, [] __device__(double a, double b) { return a - b; });
+  // If (S)AR or (S)MA are not valid for the inverse transform, set them to zero
+  thrust::for_each(
+    thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+      if (p) {
+        double* b_ar = d_ar + bid * p;
+        bool valid   = test_invparams<true>(b_ar, p);
+        if (!valid) {
+          for (int ip = 0; ip < p; ip++)
+            b_ar[ip] = 0;
+        }
       }
-      // In other cases, we initialize to zero
-      else {
-        RAFT_CUDA_TRY(
-          cudaMemsetAsync(params.beta, 0, order.n_exog * batch_size * sizeof(double), stream));
+      if (q) {
+        double* b_ma = d_ma + bid * q;
+        bool valid   = test_invparams<false>(b_ma, q);
+        if (!valid) {
+          for (int iq = 0; iq < q; iq++)
+            b_ma[iq] = 0;
+        }
       }
-    }
+    });
+}
 
-    // Estimate an ARMA fit without seasonality
-    if (order.p + order.q + order.k)
-      _arma_least_squares(handle,
-                          params.ar,
-                          params.ma,
-                          params.sigma2,
-                          bm_y,
-                          order.p,
-                          order.q,
-                          1,
-                          true,
-                          order.k,
-                          params.mu);
-
-    // Estimate a seasonal ARMA fit independantly
-    if (order.P + order.Q)
-      _arma_least_squares(handle,
-                          params.sar,
-                          params.sma,
-                          params.sigma2,
-                          bm_y,
-                          order.P,
-                          order.Q,
-                          order.s,
-                          order.p + order.q + order.k == 0);
-  }
+/**
+ * Auxiliary function of estimate_x0: compute the starting parameters for
+ * the series pre-processed by estimate_x0
+ */
+void _start_params(raft::handle_t& handle,
+                   ARIMAParams<double>& params,
+                   MLCommon::LinAlg::Batched::Matrix<double>& bm_y,
+                   const MLCommon::LinAlg::Batched::Matrix<double>& bm_exog,
+                   const ARIMAOrder& order)
+{
+  int batch_size      = bm_exog.batches();
+  cudaStream_t stream = bm_exog.stream();
 
-  void estimate_x0(raft::handle_t & handle,
-                   ARIMAParams<double> & params,
-                   const double* d_y,
-                   const double* d_exog,
-                   int batch_size,
-                   int n_obs,
-                   const ARIMAOrder& order,
-                   bool missing)
-  {
-    raft::common::nvtx::range fun_scope(__func__);
-    const auto& handle_impl = handle;
-    auto stream             = handle_impl.get_stream();
-    auto cublas_handle      = handle_impl.get_cublas_handle();
-
-    /// TODO: solve exogenous coefficients with only valid rows instead of interpolation?
-    // Pros: better coefficients
-    // Cons: harder to test, a bit more complicated
-
-    // Least squares can't deal with missing values: create copy with naive
-    // replacements for missing values
-    const double* d_y_no_missing;
-    rmm::device_uvector<double> y_no_missing(0, stream);
-    if (missing) {
-      y_no_missing.resize(n_obs * batch_size, stream);
-      d_y_no_missing = y_no_missing.data();
-
-      raft::copy(y_no_missing.data(), d_y, n_obs * batch_size, stream);
-      MLCommon::TimeSeries::fillna(y_no_missing.data(), batch_size, n_obs, stream);
-    } else {
-      d_y_no_missing = d_y;
-    }
+  // Estimate exog coefficients and subtract component to endog.
+  // Exog coefficients are estimated by fitting a linear regression with X=exog, y=endog
+  if (order.n_exog > 0) {
+    // In most cases, the system will be overdetermined and we can use gels
+    if (bm_exog.shape().first > static_cast<unsigned int>(order.n_exog)) {
+      // Make a copy of the exogenous series for in-place gels
+      MLCommon::LinAlg::Batched::Matrix<double> bm_exog_copy(bm_exog);
+      // Make a copy of the endogenous series for in-place gels
+      MLCommon::LinAlg::Batched::Matrix<double> bm_y_copy(bm_y);
 
-    // Difference if necessary, copy otherwise
-    MLCommon::LinAlg::Batched::Matrix<double> bm_yd(
-      n_obs - order.d - order.s * order.D, 1, batch_size, cublas_handle, stream, false);
-    MLCommon::TimeSeries::prepare_data(
-      bm_yd.raw_data(), d_y_no_missing, batch_size, n_obs, order.d, order.D, order.s, stream);
+      // Least-squares solution of overdetermined system
+      rmm::device_uvector<int> info(batch_size, stream);
+      b_gels(bm_exog_copy, bm_y_copy, info.data());
 
-    // Difference or copy exog
-    MLCommon::LinAlg::Batched::Matrix<double> bm_exog_diff(
-      n_obs - order.d - order.s * order.D, order.n_exog, batch_size, cublas_handle, stream, false);
-    if (order.n_exog > 0) {
-      MLCommon::TimeSeries::prepare_data(bm_exog_diff.raw_data(),
-                                         d_exog,
-                                         order.n_exog * batch_size,
-                                         n_obs,
-                                         order.d,
-                                         order.D,
-                                         order.s,
-                                         stream);
+      // Make a batched matrix around the exogenous coefficients
+      rmm::device_uvector<double*> beta_pointers(batch_size, stream);
+      MLCommon::LinAlg::Batched::Matrix<double> bm_exog_coef(order.n_exog,
+                                                             1,
+                                                             batch_size,
+                                                             bm_exog.cublasHandle(),
+                                                             beta_pointers.data(),
+                                                             params.beta,
+                                                             stream,
+                                                             false);
+
+      // Copy the solution of the system to the parameters array
+      b_2dcopy(bm_y_copy, bm_exog_coef, 0, 0, order.n_exog, 1);
+
+      // Set parameters to zero when solving was not successful
+      auto counting       = thrust::make_counting_iterator(0);
+      int* devInfoArray   = info.data();
+      double* d_exog_coef = bm_exog_coef.raw_data();
+      const int& n_exog   = order.n_exog;
+      thrust::for_each(
+        thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
+          if (devInfoArray[bid] > 0) {
+            for (int i = 0; i < n_exog; i++) {
+              d_exog_coef[bid * n_exog + i] = 0.0;
+            }
+          }
+        });
+
+      // Compute exogenous component and store the result in bm_y_copy
+      b_gemm(false,
+             false,
+             bm_exog.shape().first,
+             1,
+             bm_exog.shape().second,
+             1.0,
+             bm_exog,
+             bm_exog_coef,
+             0.0,
+             bm_y_copy);
+
+      // Subtract exogenous component to endogenous variable
+      b_aA_op_B(bm_y, bm_y_copy, bm_y, [] __device__(double a, double b) { return a - b; });
     }
+    // In other cases, we initialize to zero
+    else {
+      RAFT_CUDA_TRY(
+        cudaMemsetAsync(params.beta, 0, order.n_exog * batch_size * sizeof(double), stream));
+    }
+  }
 
-    // Do the computation of the initial parameters
-    _start_params(handle, params, bm_yd, bm_exog_diff, order);
+  // Estimate an ARMA fit without seasonality
+  if (order.p + order.q + order.k)
+    _arma_least_squares(handle,
+                        params.ar,
+                        params.ma,
+                        params.sigma2,
+                        bm_y,
+                        order.p,
+                        order.q,
+                        1,
+                        true,
+                        order.k,
+                        params.mu);
+
+  // Estimate a seasonal ARMA fit independantly
+  if (order.P + order.Q)
+    _arma_least_squares(handle,
+                        params.sar,
+                        params.sma,
+                        params.sigma2,
+                        bm_y,
+                        order.P,
+                        order.Q,
+                        order.s,
+                        order.p + order.q + order.k == 0);
+}
+
+void estimate_x0(raft::handle_t& handle,
+                 ARIMAParams<double>& params,
+                 const double* d_y,
+                 const double* d_exog,
+                 int batch_size,
+                 int n_obs,
+                 const ARIMAOrder& order,
+                 bool missing)
+{
+  raft::common::nvtx::range fun_scope(__func__);
+  const auto& handle_impl = handle;
+  auto stream             = handle_impl.get_stream();
+  auto cublas_handle      = handle_impl.get_cublas_handle();
+
+  /// TODO: solve exogenous coefficients with only valid rows instead of interpolation?
+  // Pros: better coefficients
+  // Cons: harder to test, a bit more complicated
+
+  // Least squares can't deal with missing values: create copy with naive
+  // replacements for missing values
+  const double* d_y_no_missing;
+  rmm::device_uvector<double> y_no_missing(0, stream);
+  if (missing) {
+    y_no_missing.resize(n_obs * batch_size, stream);
+    d_y_no_missing = y_no_missing.data();
+
+    raft::copy(y_no_missing.data(), d_y, n_obs * batch_size, stream);
+    MLCommon::TimeSeries::fillna(y_no_missing.data(), batch_size, n_obs, stream);
+  } else {
+    d_y_no_missing = d_y;
   }
 
+  // Difference if necessary, copy otherwise
+  MLCommon::LinAlg::Batched::Matrix<double> bm_yd(
+    n_obs - order.d - order.s * order.D, 1, batch_size, cublas_handle, stream, false);
+  MLCommon::TimeSeries::prepare_data(
+    bm_yd.raw_data(), d_y_no_missing, batch_size, n_obs, order.d, order.D, order.s, stream);
+
+  // Difference or copy exog
+  MLCommon::LinAlg::Batched::Matrix<double> bm_exog_diff(
+    n_obs - order.d - order.s * order.D, order.n_exog, batch_size, cublas_handle, stream, false);
+  if (order.n_exog > 0) {
+    MLCommon::TimeSeries::prepare_data(bm_exog_diff.raw_data(),
+                                       d_exog,
+                                       order.n_exog * batch_size,
+                                       n_obs,
+                                       order.d,
+                                       order.D,
+                                       order.s,
+                                       stream);
+  }
+
+  // Do the computation of the initial parameters
+  _start_params(handle, params, bm_yd, bm_exog_diff, order);
+}
+
 }  // namespace ML

From 1fe951bd4331cddeb408dfdc5500396444a448f2 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Sat, 15 Oct 2022 15:17:48 -0400
Subject: [PATCH 32/38] Still fixing bad merge

---
 cpp/src/arima/batched_arima.cu | 308 ---------------------------------
 1 file changed, 308 deletions(-)

diff --git a/cpp/src/arima/batched_arima.cu b/cpp/src/arima/batched_arima.cu
index b65bb35033..aad5825936 100644
--- a/cpp/src/arima/batched_arima.cu
+++ b/cpp/src/arima/batched_arima.cu
@@ -556,316 +556,8 @@ void batched_loglike_grad(raft::handle_t& handle,
                   method,
                   truncate);
 
-<<<<<<< Updated upstream
   for (int i = 0; i < N; i++) {
     // Add the perturbation to the i-th parameter
-=======
-  // First derivative with a first-order accuracy
-  thrust::for_each(
-    thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-      d_grad[N * bid + i] = (d_ll_pert[bid] - d_ll_base[bid]) / h;
-    });
-
-  // Reset the i-th parameter
-  thrust::for_each(
-    thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-      d_x_pert[N * bid + i] = d_x[N * bid + i];
-    });
-}
-}
-
-void information_criterion(raft::handle_t& handle,
-                           const ARIMAMemory<double>& arima_mem,
-                           const double* d_y,
-                           const double* d_exog,
-                           int batch_size,
-                           int n_obs,
-                           const ARIMAOrder& order,
-                           const ARIMAParams<double>& params,
-                           double* d_ic,
-                           int ic_type)
-{
-  raft::common::nvtx::range fun_scope(__func__);
-  auto stream = handle.get_stream();
-
-  /* Compute log-likelihood in d_ic */
-  batched_loglike(
-    handle, arima_mem, d_y, d_exog, batch_size, n_obs, order, params, d_ic, false, false, MLE);
-
-  /* Compute information criterion from log-likelihood and base term */
-  raft::stats::information_criterion_batched(handle,
-                                             d_ic,
-                                             d_ic,
-                                             static_cast<MLCommon::Metrics::IC_Type>(ic_type),
-                                             order.complexity(),
-                                             batch_size,
-                                             n_obs - order.n_diff());
-}
-
-/**
- * Test that the parameters are valid for the inverse transform
- *
- * @tparam isAr        Are these (S)AR or (S)MA parameters?
- * @param[in]  params  Parameters
- * @param[in]  pq      p for AR, q for MA, P for SAR, Q for SMA
- */
-template <bool isAr>
-DI bool test_invparams(const double* params, int pq)
-{
-  double new_params[8];
-  double tmp[8];
-
-  constexpr double coef = isAr ? 1 : -1;
-
-  for (int i = 0; i < pq; i++) {
-    tmp[i] = params[i];
-    new_params[i] = tmp[i];
-  }
-
-  // Perform inverse transform and stop before atanh step
-  for (int j = pq - 1; j > 0; --j) {
-    double a = new_params[j];
-    for (int k = 0; k < j; ++k) {
-      tmp[k] = (new_params[k] + coef * a * new_params[j - k - 1]) / (1 - (a * a));
-    }
-    for (int iter = 0; iter < j; ++iter) {
-      new_params[iter] = tmp[iter];
-    }
-  }
-
-  // Verify that the values are between -1 and 1
-  bool result = true;
-  for (int i = 0; i < pq; i++) {
-    result = result && !(new_params[i] <= -1 || new_params[i] >= 1);
-  }
-  return result;
-}
-
-/**
- * Auxiliary function of _start_params: least square approximation of an
- * ARMA model (with or without seasonality)
- * @note: in this function the non-seasonal case has s=1, not s=0!
- */
-void _arma_least_squares(raft::handle_t& handle,
-                         double* d_ar,
-                         double* d_ma,
-                         double* d_sigma2,
-                         const MLCommon::LinAlg::Batched::Matrix<double>& bm_y,
-                         int p,
-                         int q,
-                         int s,
-                         bool estimate_sigma2,
-                         int k = 0,
-                         double* d_mu = nullptr)
-{
-  const auto& handle_impl = handle;
-  auto stream = handle_impl.get_stream();
-  auto cublas_handle = handle_impl.get_cublas_handle();
-  auto counting = thrust::make_counting_iterator(0);
-
-  int batch_size = bm_y.batches();
-  int n_obs = bm_y.shape().first;
-
-  int ps = p * s, qs = q * s;
-  int p_ar = std::max(ps, 2 * qs);
-  int r = std::max(p_ar + qs, ps);
-
-  if ((q && p_ar >= n_obs - p_ar) || p + q + k >= n_obs - r) {
-    // Too few observations for the estimate, fill with 0 (1 for sigma2)
-    if (k) RAFT_CUDA_TRY(cudaMemsetAsync(d_mu, 0, sizeof(double) * batch_size, stream));
-    if (p) RAFT_CUDA_TRY(cudaMemsetAsync(d_ar, 0, sizeof(double) * p * batch_size, stream));
-    if (q) RAFT_CUDA_TRY(cudaMemsetAsync(d_ma, 0, sizeof(double) * q * batch_size, stream));
-    if (estimate_sigma2) {
-      thrust::device_ptr<double> sigma2_thrust = thrust::device_pointer_cast(d_sigma2);
-      thrust::fill(thrust::cuda::par.on(stream), sigma2_thrust, sigma2_thrust + batch_size, 1.0);
-    }
-    return;
-  }
-
-  /* Matrix formed by lag matrices of y and the residuals respectively,
-   * side by side. The left side will be used to estimate AR, the right
-   * side to estimate MA */
-  MLCommon::LinAlg::Batched::Matrix<double> bm_ls_ar_res(
-    n_obs - r, p + q + k, batch_size, cublas_handle, stream, false);
-  int ar_offset = r - ps;
-  int res_offset = r - p_ar - qs;
-
-  // Get residuals from an AR(p_ar) model to estimate the MA parameters
-  if (q) {
-    // Create lagged y
-    int ls_height = n_obs - p_ar;
-    MLCommon::LinAlg::Batched::Matrix<double> bm_ls =
-      MLCommon::LinAlg::Batched::b_lagged_mat(bm_y, p_ar);
-
-    /* Matrix for the initial AR fit, initialized by copy of y
-     * (note: this is because gels works in-place ; the matrix has larger
-     *  dimensions than the actual AR fit) */
-    MLCommon::LinAlg::Batched::Matrix<double> bm_ar_fit =
-      MLCommon::LinAlg::Batched::b_2dcopy(bm_y, p_ar, 0, ls_height, 1);
-
-    // Residual, initialized as offset y to avoid one kernel call
-    MLCommon::LinAlg::Batched::Matrix<double> bm_residual(bm_ar_fit);
-
-    // Initial AR fit
-    MLCommon::LinAlg::Batched::b_gels(bm_ls, bm_ar_fit);
-
-    // Compute residual (technically a gemv)
-    MLCommon::LinAlg::Batched::b_gemm(
-      false, false, ls_height, 1, p_ar, -1.0, bm_ls, bm_ar_fit, 1.0, bm_residual);
-
-    // Lags of the residual
-    MLCommon::LinAlg::Batched::b_lagged_mat(
-      bm_residual, bm_ls_ar_res, q, n_obs - r, res_offset, (n_obs - r) * (k + p), s);
-  }
-
-  // Fill the first column of the matrix with 1 if we fit an intercept
-  if (k) {
-    double* d_ls_ar_res = bm_ls_ar_res.raw_data();
-    thrust::for_each(
-      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-        double* b_ls_ar_res = d_ls_ar_res + bid * (n_obs - r) * (p + q + k);
-        for (int i = 0; i < n_obs - r; i++) {
-          b_ls_ar_res[i] = 1.0;
-        }
-      });
-  }
-
-  // Lags of y
-  MLCommon::LinAlg::Batched::b_lagged_mat(
-    bm_y, bm_ls_ar_res, p, n_obs - r, ar_offset, (n_obs - r) * k, s);
-
-  /* Initializing the vector for the ARMA fit
-   * (note: also in-place as described for AR fit) */
-  MLCommon::LinAlg::Batched::Matrix<double> bm_arma_fit =
-    MLCommon::LinAlg::Batched::b_2dcopy(bm_y, r, 0, n_obs - r, 1);
-
-  // The residuals will be computed only if sigma2 is requested
-  MLCommon::LinAlg::Batched::Matrix<double> bm_final_residual(
-    n_obs - r, 1, batch_size, cublas_handle, stream, false);
-  if (estimate_sigma2) {
-    raft::copy(
-      bm_final_residual.raw_data(), bm_arma_fit.raw_data(), (n_obs - r) * batch_size, stream);
-  }
-
-  // ARMA fit
-  MLCommon::LinAlg::Batched::b_gels(bm_ls_ar_res, bm_arma_fit);
-
-  // Copy the results in the parameter vectors
-  const double* d_arma_fit = bm_arma_fit.raw_data();
-  thrust::for_each(
-    thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-      const double* b_arma_fit = d_arma_fit + bid * (n_obs - r);
-      if (k) { d_mu[bid] = b_arma_fit[0]; }
-      if (p) {
-        double* b_ar = d_ar + bid * p;
-        for (int i = 0; i < p; i++) {
-          b_ar[i] = b_arma_fit[i + k];
-        }
-      }
-      if (q) {
-        double* b_ma = d_ma + bid * q;
-        for (int i = 0; i < q; i++) {
-          b_ma[i] = b_arma_fit[i + p + k];
-        }
-      }
-    });
-
-  if (estimate_sigma2) {
-    // Compute final residual (technically a gemv)
-    MLCommon::LinAlg::Batched::b_gemm(false,
-                                      false,
-                                      n_obs - r,
-                                      1,
-                                      p + q + k,
-                                      -1.0,
-                                      bm_ls_ar_res,
-                                      bm_arma_fit,
-                                      1.0,
-                                      bm_final_residual);
-
-    // Compute variance
-    double* d_residual = bm_final_residual.raw_data();
-    thrust::for_each(
-      thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-        double acc = 0.0;
-        const double* b_residual = d_residual + (n_obs - r) * bid;
-        for (int i = q; i < n_obs - r; i++) {
-          double res = b_residual[i];
-          acc += res * res;
-        }
-        d_sigma2[bid] = acc / static_cast<double>(n_obs - r - q);
-      });
-  }
-
-  // If (S)AR or (S)MA are not valid for the inverse transform, set them to zero
-  thrust::for_each(
-    thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
-      if (p) {
-        double* b_ar = d_ar + bid * p;
-        bool valid = test_invparams<true>(b_ar, p);
-        if (!valid) {
-          for (int ip = 0; ip < p; ip++)
-            b_ar[ip] = 0;
-        }
-      }
-      if (q) {
-        double* b_ma = d_ma + bid * q;
-        bool valid = test_invparams<false>(b_ma, q);
-        if (!valid) {
-          for (int iq = 0; iq < q; iq++)
-            b_ma[iq] = 0;
-        }
-      }
-    });
-}
-
-/**
- * Auxiliary function of estimate_x0: compute the starting parameters for
- * the series pre-processed by estimate_x0
- */
-void _start_params(raft::handle_t& handle,
-                   ARIMAParams<double>& params,
-                   MLCommon::LinAlg::Batched::Matrix<double>& bm_y,
-                   const MLCommon::LinAlg::Batched::Matrix<double>& bm_exog,
-                   const ARIMAOrder& order)
-{
-  int batch_size = bm_exog.batches();
-  cudaStream_t stream = bm_exog.stream();
-
-  // Estimate exog coefficients and subtract component to endog.
-  // Exog coefficients are estimated by fitting a linear regression with X=exog, y=endog
-  if (order.n_exog > 0) {
-    // In most cases, the system will be overdetermined and we can use gels
-    if (bm_exog.shape().first > static_cast<unsigned int>(order.n_exog)) {
-      // Make a copy of the exogenous series for in-place gels
-      MLCommon::LinAlg::Batched::Matrix<double> bm_exog_copy(bm_exog);
-      // Make a copy of the endogenous series for in-place gels
-      MLCommon::LinAlg::Batched::Matrix<double> bm_y_copy(bm_y);
-
-      // Least-squares solution of overdetermined system
-      rmm::device_uvector<int> info(batch_size, stream);
-      b_gels(bm_exog_copy, bm_y_copy, info.data());
-
-      // Make a batched matrix around the exogenous coefficients
-      rmm::device_uvector<double*> beta_pointers(batch_size, stream);
-      MLCommon::LinAlg::Batched::Matrix<double> bm_exog_coef(order.n_exog,
-                                                             1,
-                                                             batch_size,
-                                                             bm_exog.cublasHandle(),
-                                                             beta_pointers.data(),
-                                                             params.beta,
-                                                             stream,
-                                                             false);
-
-      // Copy the solution of the system to the parameters array
-      b_2dcopy(bm_y_copy, bm_exog_coef, 0, 0, order.n_exog, 1);
-
-      // Set parameters to zero when solving was not successful
-      auto counting = thrust::make_counting_iterator(0);
-      int* devInfoArray = info.data();
-      double* d_exog_coef = bm_exog_coef.raw_data();
-      const int& n_exog = order.n_exog;
->>>>>>> Stashed changes
     thrust::for_each(
       thrust::cuda::par.on(stream), counting, counting + batch_size, [=] __device__(int bid) {
         d_x_pert[N * bid + i] = d_x[N * bid + i] + h;

From 0d67a2978ce0aeb93c715f000eb85492d68d62a1 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Sun, 16 Oct 2022 13:29:02 -0400
Subject: [PATCH 33/38] Removing duplicated labels prims

---
 cpp/bench/sg/svc.cu                     |   16 +-
 cpp/bench/sg/svr.cu                     |   16 +-
 cpp/include/cuml/matrix/kernelparams.h  |    4 +-
 cpp/include/cuml/svm/svc.hpp            |   12 +-
 cpp/src/arima/batched_arima.cu          |    9 +-
 cpp/src/dbscan/mergelabels/runner.cuh   |    4 +-
 cpp/src/dbscan/runner.cuh               |    4 +-
 cpp/src/hdbscan/detail/reachability.cuh |  407 +++---
 cpp/src/kmeans/common.cuh               |   20 +-
 cpp/src/kmeans/kmeans_mg_impl.cuh       |    4 +-
 cpp/src/kmeans/sg_impl.cuh              |    4 +-
 cpp/src/knn/knn_opg_common.cuh          | 1729 +++++++++++------------
 cpp/src/knn/knn_sparse.cu               |   99 +-
 cpp/src/metrics/trustworthiness.cu      |   92 +-
 cpp/src/metrics/v_measure.cu            |    7 +-
 cpp/src/randomforest/randomforest.cuh   |  513 ++++---
 cpp/src/svm/kernelcache.cuh             |    4 +-
 cpp/src/svm/svc.cu                      |   12 +-
 cpp/src/svm/svc_impl.cuh                |    2 +-
 cpp/src/svm/svr.cu                      |    9 +-
 cpp/src/svm/svr_impl.cuh                |    7 +-
 cpp/src/tsne/tsne.cu                    |  108 +-
 cpp/src/umap/knn_graph/algo.cuh         |   33 +-
 cpp/src_prims/cache/cache.cuh           |  410 ------
 cpp/src_prims/cache/cache_util.cuh      |  366 -----
 cpp/src_prims/label/classlabels.cuh     |  176 ---
 cpp/src_prims/label/merge_labels.cuh    |  156 --
 cpp/src_prims/selection/knn.cuh         |    8 +-
 cpp/test/CMakeLists.txt                 |    2 -
 cpp/test/prims/cache.cu                 |  309 ----
 cpp/test/prims/label.cu                 |  105 --
 cpp/test/prims/merge_labels.cu          |  125 --
 cpp/test/sg/svc_test.cu                 |    3 +-
 cpp/test/sg/tsne_test.cu                |   11 +-
 cpp/test/sg/umap_parametrizable_test.cu |   16 +-
 35 files changed, 1554 insertions(+), 3248 deletions(-)
 delete mode 100644 cpp/src_prims/cache/cache.cuh
 delete mode 100644 cpp/src_prims/cache/cache_util.cuh
 delete mode 100644 cpp/src_prims/label/classlabels.cuh
 delete mode 100644 cpp/src_prims/label/merge_labels.cuh
 delete mode 100644 cpp/test/prims/cache.cu
 delete mode 100644 cpp/test/prims/label.cu
 delete mode 100644 cpp/test/prims/merge_labels.cu

diff --git a/cpp/bench/sg/svc.cu b/cpp/bench/sg/svc.cu
index 8b8b391ece..6fcceb979f 100644
--- a/cpp/bench/sg/svc.cu
+++ b/cpp/bench/sg/svc.cu
@@ -20,10 +20,10 @@
 
 #include "benchmark.cuh"
 #include <cmath>
-#include <cuml/matrix/kernelparams.h>
 #include <cuml/svm/svc.hpp>
 #include <cuml/svm/svm_model.h>
 #include <cuml/svm/svm_parameter.h>
+#include <raft/distance/kernels.cuh>
 #include <sstream>
 #include <utility>
 
@@ -35,7 +35,7 @@ template <typename D>
 struct SvcParams {
   DatasetParams data;
   BlobsParams blobs;
-  MLCommon::Matrix::KernelParams kernel;
+  raft::distance::kernels::KernelParams kernel;
   ML::SVM::SvmParameter svm_param;
   ML::SVM::SvmModel<D> model;
 };
@@ -78,7 +78,7 @@ class SVC : public BlobsFixture<D, D> {
   }
 
  private:
-  MLCommon::Matrix::KernelParams kernel;
+  raft::distance::kernels::KernelParams kernel;
   ML::SVM::SvmParameter svm_param;
   ML::SVM::SvmModel<D> model;
 };
@@ -106,11 +106,11 @@ std::vector<SvcParams<D>> getInputs()
 
   std::vector<Triplets> rowcols = {{50000, 2, 2}, {2048, 100000, 2}, {50000, 1000, 2}};
 
-  std::vector<MLCommon::Matrix::KernelParams> kernels{
-    MLCommon::Matrix::KernelParams{MLCommon::Matrix::LINEAR, 3, 1, 0},
-    MLCommon::Matrix::KernelParams{MLCommon::Matrix::POLYNOMIAL, 3, 1, 0},
-    MLCommon::Matrix::KernelParams{MLCommon::Matrix::RBF, 3, 1, 0},
-    MLCommon::Matrix::KernelParams{MLCommon::Matrix::TANH, 3, 0.1, 0}};
+  std::vector<raft::distance::kernels::KernelParams> kernels{
+    raft::distance::kernels::KernelParams{raft::distance::kernels::LINEAR, 3, 1, 0},
+    raft::distance::kernels::KernelParams{raft::distance::kernels::POLYNOMIAL, 3, 1, 0},
+    raft::distance::kernels::KernelParams{raft::distance::kernels::RBF, 3, 1, 0},
+    raft::distance::kernels::KernelParams{raft::distance::kernels::TANH, 3, 0.1, 0}};
 
   for (auto& rc : rowcols) {
     p.data.nrows    = rc.nrows;
diff --git a/cpp/bench/sg/svr.cu b/cpp/bench/sg/svr.cu
index 408dde9fd8..e3a7de45f3 100644
--- a/cpp/bench/sg/svr.cu
+++ b/cpp/bench/sg/svr.cu
@@ -20,11 +20,11 @@
 
 #include "benchmark.cuh"
 #include <cmath>
-#include <cuml/matrix/kernelparams.h>
 #include <cuml/svm/svc.hpp>
 #include <cuml/svm/svm_model.h>
 #include <cuml/svm/svm_parameter.h>
 #include <cuml/svm/svr.hpp>
+#include <raft/distance/kernels.cuh>
 #include <utility>
 
 namespace ML {
@@ -35,7 +35,7 @@ template <typename D>
 struct SvrParams {
   DatasetParams data;
   RegressionParams regression;
-  MLCommon::Matrix::KernelParams kernel;
+  raft::distance::kernels::KernelParams kernel;
   ML::SVM::SvmParameter svm_param;
   ML::SVM::SvmModel<D>* model;
 };
@@ -77,7 +77,7 @@ class SVR : public RegressionFixture<D> {
   }
 
  private:
-  MLCommon::Matrix::KernelParams kernel;
+  raft::distance::kernels::KernelParams kernel;
   ML::SVM::SvmParameter svm_param;
   ML::SVM::SvmModel<D>* model;
 };
@@ -108,11 +108,11 @@ std::vector<SvrParams<D>> getInputs()
 
   std::vector<Triplets> rowcols = {{50000, 2, 2}, {1024, 10000, 10}, {3000, 200, 200}};
 
-  std::vector<MLCommon::Matrix::KernelParams> kernels{
-    MLCommon::Matrix::KernelParams{MLCommon::Matrix::LINEAR, 3, 1, 0},
-    MLCommon::Matrix::KernelParams{MLCommon::Matrix::POLYNOMIAL, 3, 1, 0},
-    MLCommon::Matrix::KernelParams{MLCommon::Matrix::RBF, 3, 1, 0},
-    MLCommon::Matrix::KernelParams{MLCommon::Matrix::TANH, 3, 0.1, 0}};
+  std::vector<raft::distance::kernels::KernelParams> kernels{
+    raft::distance::kernels::KernelParams{raft::distance::kernels::LINEAR, 3, 1, 0},
+    raft::distance::kernels::KernelParams{raft::distance::kernels::POLYNOMIAL, 3, 1, 0},
+    raft::distance::kernels::KernelParams{raft::distance::kernels::RBF, 3, 1, 0},
+    raft::distance::kernels::KernelParams{raft::distance::kernels::TANH, 3, 0.1, 0}};
 
   for (auto& rc : rowcols) {
     p.data.nrows               = rc.nrows;
diff --git a/cpp/include/cuml/matrix/kernelparams.h b/cpp/include/cuml/matrix/kernelparams.h
index 5815405938..2a0d86e2ed 100644
--- a/cpp/include/cuml/matrix/kernelparams.h
+++ b/cpp/include/cuml/matrix/kernelparams.h
@@ -21,8 +21,8 @@
 namespace MLCommon {
 namespace Matrix {
 
-using raft::distance::KernelParams;
-using raft::distance::KernelType;
+using raft::distance::kernels::KernelParams;
+using raft::distance::kernels::KernelType;
 
 };  // end namespace Matrix
 };  // end namespace MLCommon
diff --git a/cpp/include/cuml/svm/svc.hpp b/cpp/include/cuml/svm/svc.hpp
index 5413929ab1..05a0b3bb2c 100644
--- a/cpp/include/cuml/svm/svc.hpp
+++ b/cpp/include/cuml/svm/svc.hpp
@@ -19,8 +19,8 @@
 #include "svm_model.h"
 #include "svm_parameter.h"
 #include <cuml/common/logger.hpp>
-#include <cuml/matrix/kernelparams.h>
 #include <raft/core/handle.hpp>
+#include <raft/distance/distance_types.hpp>
 
 // namespace raft {
 // class handle_t;
@@ -57,7 +57,7 @@ void svcFit(const raft::handle_t& handle,
             int n_cols,
             math_t* labels,
             const SvmParameter& param,
-            MLCommon::Matrix::KernelParams& kernel_params,
+            raft::distance::kernels::KernelParams& kernel_params,
             SvmModel<math_t>& model,
             const math_t* sample_weight);
 
@@ -95,7 +95,7 @@ void svcPredict(const raft::handle_t& handle,
                 math_t* input,
                 int n_rows,
                 int n_cols,
-                MLCommon::Matrix::KernelParams& kernel_params,
+                raft::distance::kernels::KernelParams& kernel_params,
                 const SvmModel<math_t>& model,
                 math_t* preds,
                 math_t buffer_size,
@@ -134,7 +134,7 @@ class SVC {
  public:
   // Public members for easier access during testing from Python.
 
-  MLCommon::Matrix::KernelParams kernel_params;
+  raft::distance::kernels::KernelParams kernel_params;
   SvmParameter param;
   SvmModel<math_t> model;
   /**
@@ -151,8 +151,8 @@ class SVC {
   SVC(raft::handle_t& handle,
       math_t C   = 1,
       math_t tol = 1.0e-3,
-      MLCommon::Matrix::KernelParams kernel_params =
-        MLCommon::Matrix::KernelParams{MLCommon::Matrix::LINEAR, 3, 1, 0},
+      raft::distance::kernels::KernelParams kernel_params =
+        raft::distance::kernels::KernelParams{raft::distance::kernels::LINEAR, 3, 1, 0},
       math_t cache_size  = 200,
       int max_iter       = -1,
       int nochange_steps = 1000,
diff --git a/cpp/src/arima/batched_arima.cu b/cpp/src/arima/batched_arima.cu
index aad5825936..4d0840c1d8 100644
--- a/cpp/src/arima/batched_arima.cu
+++ b/cpp/src/arima/batched_arima.cu
@@ -29,11 +29,13 @@
 #include <cuml/tsa/batched_arima.hpp>
 #include <cuml/tsa/batched_kalman.hpp>
 
+#include <common/nvtx.hpp>
 #include <linalg/batched/matrix.cuh>
 #include <raft/core/handle.hpp>
 #include <raft/core/nvtx.hpp>
-#include <raft/linalg/matrix_vector_op.hpp>
-#include <raft/stats/information_criterion.hpp>
+#include <raft/linalg/matrix_vector_op.cuh>
+#include <raft/stats/common.hpp>
+#include <raft/stats/information_criterion.cuh>
 #include <raft/util/cuda_utils.cuh>
 #include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
@@ -41,6 +43,7 @@
 #include <timeSeries/fillna.cuh>
 
 namespace ML {
+
 void pack(raft::handle_t& handle,
           const ARIMAParams<double>& params,
           const ARIMAOrder& order,
@@ -997,4 +1000,4 @@ void estimate_x0(raft::handle_t& handle,
   _start_params(handle, params, bm_yd, bm_exog_diff, order);
 }
 
-}  // namespace ML
+}  // namespace ML
\ No newline at end of file
diff --git a/cpp/src/dbscan/mergelabels/runner.cuh b/cpp/src/dbscan/mergelabels/runner.cuh
index 4dc40e2d09..c4d109ae39 100644
--- a/cpp/src/dbscan/mergelabels/runner.cuh
+++ b/cpp/src/dbscan/mergelabels/runner.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <label/merge_labels.cuh>
+#include <raft/label/merge_labels.cuh>
 
 #include <raft/core/handle.hpp>
 namespace ML {
@@ -44,7 +44,7 @@ void run(const raft::handle_t& handle,
          Index_ N,
          cudaStream_t stream)
 {
-  MLCommon::Label::merge_labels<Index_, TPB_X>(labels_a, labels_b, mask, work_buffer, m, N, stream);
+  raft::label::merge_labels<Index_, TPB_X>(labels_a, labels_b, mask, work_buffer, m, N, stream);
 }
 
 }  // namespace MergeLabels
diff --git a/cpp/src/dbscan/runner.cuh b/cpp/src/dbscan/runner.cuh
index 7a5fb34822..d4c6100361 100644
--- a/cpp/src/dbscan/runner.cuh
+++ b/cpp/src/dbscan/runner.cuh
@@ -23,9 +23,9 @@
 #include "mergelabels/tree_reduction.cuh"
 #include "vertexdeg/runner.cuh"
 #include <common/nvtx.hpp>
-#include <label/classlabels.cuh>
 #include <raft/core/cudart_utils.hpp>
 #include <raft/core/nvtx.hpp>
+#include <raft/label/classlabels.cuh>
 #include <raft/sparse/csr.hpp>
 
 #include <cuml/common/logger.hpp>
@@ -74,7 +74,7 @@ template <typename Index_ = int>
 void final_relabel(Index_* db_cluster, Index_ N, cudaStream_t stream)
 {
   Index_ MAX_LABEL = std::numeric_limits<Index_>::max();
-  MLCommon::Label::make_monotonic(
+  raft::label::make_monotonic(
     db_cluster, db_cluster, N, stream, [MAX_LABEL] __device__(Index_ val) {
       return val == MAX_LABEL;
     });
diff --git a/cpp/src/hdbscan/detail/reachability.cuh b/cpp/src/hdbscan/detail/reachability.cuh
index cd9688ec2e..692c8e0ec4 100644
--- a/cpp/src/hdbscan/detail/reachability.cuh
+++ b/cpp/src/hdbscan/detail/reachability.cuh
@@ -18,8 +18,8 @@
 
 #include "reachability_faiss.cuh"
 
-#include <raft/core/cudart_utils.hpp>
-#include <raft/cuda_utils.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
 
 #include <raft/linalg/unary_op.cuh>
 
@@ -31,225 +31,214 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cuml/neighbors/knn.hpp>
-<<<<<<< HEAD
-#include <raft/distance/distance.hpp>
-#include <raft/spatial/knn/specializations.hpp>
-=======
 #include <raft/distance/distance.cuh>
->>>>>>> branch-22.10
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/transform.h>
 #include <thrust/tuple.h>
 
-namespace ML
+namespace ML {
+namespace HDBSCAN {
+namespace detail {
+namespace Reachability {
+
+/**
+ * Extract core distances from KNN graph. This is essentially
+ * performing a knn_dists[:,min_pts]
+ * @tparam value_idx data type for integrals
+ * @tparam value_t data type for distance
+ * @tparam tpb block size for kernel
+ * @param[in] knn_dists knn distance array (size n * k)
+ * @param[in] min_samples this neighbor will be selected for core distances
+ * @param[in] n_neighbors the number of neighbors of each point in the knn graph
+ * @param[in] n number of samples
+ * @param[out] out output array (size n)
+ * @param[in] stream stream for which to order cuda operations
+ */
+template <typename value_idx, typename value_t, int tpb = 256>
+void core_distances(
+  value_t* knn_dists, int min_samples, int n_neighbors, size_t n, value_t* out, cudaStream_t stream)
 {
-  namespace HDBSCAN {
-  namespace detail {
-  namespace Reachability {
-
-  /**
-   * Extract core distances from KNN graph. This is essentially
-   * performing a knn_dists[:,min_pts]
-   * @tparam value_idx data type for integrals
-   * @tparam value_t data type for distance
-   * @tparam tpb block size for kernel
-   * @param[in] knn_dists knn distance array (size n * k)
-   * @param[in] min_samples this neighbor will be selected for core distances
-   * @param[in] n_neighbors the number of neighbors of each point in the knn graph
-   * @param[in] n number of samples
-   * @param[out] out output array (size n)
-   * @param[in] stream stream for which to order cuda operations
-   */
-  template <typename value_idx, typename value_t, int tpb = 256>
-  void core_distances(value_t* knn_dists,
-                      int min_samples,
-                      int n_neighbors,
-                      size_t n,
-                      value_t* out,
-                      cudaStream_t stream)
-  {
-    ASSERT(n_neighbors >= min_samples,
-           "the size of the neighborhood should be greater than or equal to min_samples");
-
-    int blocks = raft::ceildiv(n, (size_t)tpb);
+  ASSERT(n_neighbors >= min_samples,
+         "the size of the neighborhood should be greater than or equal to min_samples");
+
+  int blocks = raft::ceildiv(n, (size_t)tpb);
+
+  auto exec_policy = rmm::exec_policy(stream);
+
+  auto indices = thrust::make_counting_iterator<value_idx>(0);
+
+  thrust::transform(exec_policy, indices, indices + n, out, [=] __device__(value_idx row) {
+    return knn_dists[row * n_neighbors + (min_samples - 1)];
+  });
+}
+
+/**
+ * Wraps the brute force knn API, to be used for both training and prediction
+ * @tparam value_idx data type for integrals
+ * @tparam value_t data type for distance
+ * @param[in] handle raft handle for resource reuse
+ * @param[in] X input data points (size m * n)
+ * @param[out] inds nearest neighbor indices (size n_search_items * k)
+ * @param[out] dists nearest neighbor distances (size n_search_items * k)
+ * @param[in] m number of rows in X
+ * @param[in] n number of columns in X
+ * @param[in] search_items array of items to search of dimensionality n (size n_search_items * n)
+ * @param[in] n_search_items number of rows in search_items
+ * @param[in] k number of nearest neighbors
+ * @param[in] metric distance metric to use
+ */
+template <typename value_idx, typename value_t>
+void compute_knn(const raft::handle_t& handle,
+                 const value_t* X,
+                 value_idx* inds,
+                 value_t* dists,
+                 size_t m,
+                 size_t n,
+                 const value_t* search_items,
+                 size_t n_search_items,
+                 int k,
+                 raft::distance::DistanceType metric)
+{
+  auto stream      = handle.get_stream();
+  auto exec_policy = handle.get_thrust_policy();
+  std::vector<value_t*> inputs;
+  inputs.push_back(const_cast<value_t*>(X));
+
+  std::vector<int> sizes;
+  sizes.push_back(m);
+
+  // This is temporary. Once faiss is updated, we should be able to
+  // pass value_idx through to knn.
+  rmm::device_uvector<int64_t> int64_indices(k * n_search_items, stream);
+
+  // perform knn
+  brute_force_knn(handle,
+                  inputs,
+                  sizes,
+                  n,
+                  const_cast<value_t*>(search_items),
+                  n_search_items,
+                  int64_indices.data(),
+                  dists,
+                  k,
+                  true,
+                  true,
+                  metric);
+
+  // convert from current knn's 64-bit to 32-bit.
+  thrust::transform(exec_policy,
+                    int64_indices.data(),
+                    int64_indices.data() + int64_indices.size(),
+                    inds,
+                    [] __device__(int64_t in) -> value_idx { return in; });
+}
+
+/**
+ * Constructs a mutual reachability graph, which is a k-nearest neighbors
+ * graph projected into mutual reachability space using the following
+ * function for each data point, where core_distance is the distance
+ * to the kth neighbor: max(core_distance(a), core_distance(b), d(a, b))
+ *
+ * Unfortunately, points in the tails of the pdf (e.g. in sparse regions
+ * of the space) can have very large neighborhoods, which will impact
+ * nearby neighborhoods. Because of this, it's possible for the
+ * radius for points in the main mass, which might have a very small
+ * radius initially, to expand very large. As a result, the initial
+ * knn which was used to compute the core distances may no longer
+ * capture the actual neighborhoods after projection into mutual
+ * reachability space.
+ *
+ * For the experimental version, we execute the knn twice: once
+ * to compute the radii (core distances) and again to capture
+ * the final neighborhoods. Future iterations of this algorithm
+ * will improve upon this "exact" version, by using
+ * more specialized data structures, such as space-partitioning
+ * structures. It has also been shown that approximate nearest
+ * neighbors can yield reasonable neighborhoods as the
+ * data sizes increase.
+ *
+ * @tparam value_idx
+ * @tparam value_t
+ * @param[in] handle raft handle for resource reuse
+ * @param[in] X input data points (size m * n)
+ * @param[in] m number of rows in X
+ * @param[in] n number of columns in X
+ * @param[in] metric distance metric to use
+ * @param[in] k neighborhood size
+ * @param[in] min_samples this neighborhood will be selected for core distances
+ * @param[in] alpha weight applied when internal distance is chosen for
+ *            mutual reachability (value of 1.0 disables the weighting)
+ * @param[out] indptr CSR indptr of output knn graph (size m + 1)
+ * @param[out] core_dists output core distances array (size m)
+ * @param[out] out COO object, uninitialized on entry, on exit it stores the
+ *             (symmetrized) maximum reachability distance for the k nearest
+ *             neighbors.
+ */
+template <typename value_idx, typename value_t>
+void mutual_reachability_graph(const raft::handle_t& handle,
+                               const value_t* X,
+                               size_t m,
+                               size_t n,
+                               raft::distance::DistanceType metric,
+                               int min_samples,
+                               value_t alpha,
+                               value_idx* indptr,
+                               value_t* core_dists,
+                               raft::sparse::COO<value_t, value_idx>& out)
+{
+  RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
+               "Currently only L2 expanded distance is supported");
 
-    auto exec_policy = rmm::exec_policy(stream);
+  auto stream      = handle.get_stream();
+  auto exec_policy = handle.get_thrust_policy();
 
-    auto indices = thrust::make_counting_iterator<value_idx>(0);
+  rmm::device_uvector<value_idx> coo_rows(min_samples * m, stream);
+  rmm::device_uvector<value_idx> inds(min_samples * m, stream);
+  rmm::device_uvector<value_t> dists(min_samples * m, stream);
 
-    thrust::transform(exec_policy, indices, indices + n, out, [=] __device__(value_idx row) {
-      return knn_dists[row * n_neighbors + (min_samples - 1)];
-    });
-  }
+  // perform knn
+  compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric);
 
-  /**
-   * Wraps the brute force knn API, to be used for both training and prediction
-   * @tparam value_idx data type for integrals
-   * @tparam value_t data type for distance
-   * @param[in] handle raft handle for resource reuse
-   * @param[in] X input data points (size m * n)
-   * @param[out] inds nearest neighbor indices (size n_search_items * k)
-   * @param[out] dists nearest neighbor distances (size n_search_items * k)
-   * @param[in] m number of rows in X
-   * @param[in] n number of columns in X
-   * @param[in] search_items array of items to search of dimensionality D (size n_search_items * n)
-   * @param[in] n_search_items number of rows in search_items
-   * @param[in] k number of nearest neighbors
-   * @param[in] metric distance metric to use
-   */
-  template <typename value_idx, typename value_t>
-  void compute_knn(const raft::handle_t& handle,
-                   const value_t* X,
-                   value_idx* inds,
-                   value_t* dists,
-                   size_t m,
-                   size_t n,
-                   const value_t* search_items,
-                   size_t n_search_items,
-                   int k,
-                   raft::distance::DistanceType metric)
-  {
-    auto stream      = handle.get_stream();
-    auto exec_policy = handle.get_thrust_policy();
-    std::vector<value_t*> inputs;
-    inputs.push_back(const_cast<value_t*>(X));
-
-    std::vector<int> sizes;
-    sizes.push_back(m);
-
-    // This is temporary. Once faiss is updated, we should be able to
-    // pass value_idx through to knn.
-    rmm::device_uvector<int64_t> int64_indices(k * n_search_items, stream);
-
-    // perform knn
-    brute_force_knn(handle,
-                    inputs,
-                    sizes,
-                    n,
-                    const_cast<value_t*>(search_items),
-                    n_search_items,
-                    int64_indices.data(),
-                    dists,
-                    k,
-                    true,
-                    true,
-                    metric);
-
-    // convert from current knn's 64-bit to 32-bit.
-    thrust::transform(exec_policy,
-                      int64_indices.data(),
-                      int64_indices.data() + int64_indices.size(),
-                      inds,
-                      [] __device__(int64_t in) -> value_idx { return in; });
-  }
+  // Slice core distances (distances to kth nearest neighbor)
+  core_distances<value_idx>(dists.data(), min_samples, min_samples, m, core_dists, stream);
 
   /**
-   * Constructs a mutual reachability graph, which is a k-nearest neighbors
-   * graph projected into mutual reachability space using the following
-   * function for each data point, where core_distance is the distance
-   * to the kth neighbor: max(core_distance(a), core_distance(b), d(a, b))
-   *
-   * Unfortunately, points in the tails of the pdf (e.g. in sparse regions
-   * of the space) can have very large neighborhoods, which will impact
-   * nearby neighborhoods. Because of this, it's possible that the
-   * radius for points in the main mass, which might have a very small
-   * radius initially, to expand very large. As a result, the initial
-   * knn which was used to compute the core distances may no longer
-   * capture the actual neighborhoods after projection into mutual
-   * reachability space.
-   *
-   * For the experimental version, we execute the knn twice- once
-   * to compute the radii (core distances) and again to capture
-   * the final neighborhoods. Future iterations of this algorithm
-   * will work improve upon this "exact" version, by using
-   * more specialized data structures, such as space-partitioning
-   * structures. It has also been shown that approximate nearest
-   * neighbors can yield reasonable neighborhoods as the
-   * data sizes increase.
-   *
-   * @tparam value_idx
-   * @tparam value_t
-   * @param[in] handle raft handle for resource reuse
-   * @param[in] X input data points (size m * n)
-   * @param[in] m number of rows in X
-   * @param[in] n number of columns in X
-   * @param[in] metric distance metric to use
-   * @param[in] k neighborhood size
-   * @param[in] min_samples this neighborhood will be selected for core distances
-   * @param[in] alpha weight applied when internal distance is chosen for
-   *            mutual reachability (value of 1.0 disables the weighting)
-   * @param[out] indptr CSR indptr of output knn graph (size m + 1)
-   * @param[out] core_dists output core distances array (size m)
-   * @param[out] out COO object, uninitialized on entry, on exit it stores the
-   *             (symmetrized) maximum reachability distance for the k nearest
-   *             neighbors.
+   * Compute L2 norm
    */
-  template <typename value_idx, typename value_t>
-  void mutual_reachability_graph(const raft::handle_t& handle,
-                                 const value_t* X,
-                                 size_t m,
-                                 size_t n,
-                                 raft::distance::DistanceType metric,
-                                 int min_samples,
-                                 value_t alpha,
-                                 value_idx* indptr,
-                                 value_t* core_dists,
-                                 raft::sparse::COO<value_t, value_idx>& out)
-  {
-    RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
-                 "Currently only L2 expanded distance is supported");
-
-    auto stream      = handle.get_stream();
-    auto exec_policy = handle.get_thrust_policy();
-
-    rmm::device_uvector<value_idx> coo_rows(min_samples * m, stream);
-    rmm::device_uvector<value_idx> inds(min_samples * m, stream);
-    rmm::device_uvector<value_t> dists(min_samples * m, stream);
-
-    // perform knn
-    compute_knn(handle, X, inds.data(), dists.data(), m, n, X, m, min_samples, metric);
-
-    // Slice core distances (distances to kth nearest neighbor)
-    core_distances<value_idx>(dists.data(), min_samples, min_samples, m, core_dists, stream);
-
-    /**
-     * Compute L2 norm
-     */
-    mutual_reachability_knn_l2(
-      handle, inds.data(), dists.data(), X, m, n, min_samples, core_dists, (value_t)1.0 / alpha);
-
-    // self-loops get max distance
-    auto coo_rows_counting_itr = thrust::make_counting_iterator<value_idx>(0);
-    thrust::transform(
-      exec_policy,
-      coo_rows_counting_itr,
-      coo_rows_counting_itr + (m * min_samples),
-      coo_rows.data(),
-      [min_samples] __device__(value_idx c) -> value_idx { return c / min_samples; });
-
-    raft::sparse::linalg::symmetrize(
-      handle, coo_rows.data(), inds.data(), dists.data(), m, m, min_samples * m, out);
-
-    raft::sparse::convert::sorted_coo_to_csr(out.rows(), out.nnz, indptr, m + 1, stream);
-
-    // self-loops get max distance
-    auto transform_in =
-      thrust::make_zip_iterator(thrust::make_tuple(out.rows(), out.cols(), out.vals()));
-
-    thrust::transform(exec_policy,
-                      transform_in,
-                      transform_in + out.nnz,
-                      out.vals(),
-                      [=] __device__(const thrust::tuple<value_idx, value_idx, value_t>& tup) {
-                        return thrust::get<0>(tup) == thrust::get<1>(tup)
-                                 ? std::numeric_limits<value_t>::max()
-                                 : thrust::get<2>(tup);
-                      });
-  }
-
-  };  // end namespace Reachability
-  };  // end namespace detail
-  };  // end namespace HDBSCAN
-};    // end namespace ML
+  mutual_reachability_knn_l2(
+    handle, inds.data(), dists.data(), X, m, n, min_samples, core_dists, (value_t)1.0 / alpha);
+
+  // build COO row indices: flattened knn entry c belongs to row c / min_samples
+  auto coo_rows_counting_itr = thrust::make_counting_iterator<value_idx>(0);
+  thrust::transform(exec_policy,
+                    coo_rows_counting_itr,
+                    coo_rows_counting_itr + (m * min_samples),
+                    coo_rows.data(),
+                    [min_samples] __device__(value_idx c) -> value_idx { return c / min_samples; });
+
+  raft::sparse::linalg::symmetrize(
+    handle, coo_rows.data(), inds.data(), dists.data(), m, m, min_samples * m, out);
+
+  raft::sparse::convert::sorted_coo_to_csr(out.rows(), out.nnz, indptr, m + 1, stream);
+
+  // self-loops get max distance
+  auto transform_in =
+    thrust::make_zip_iterator(thrust::make_tuple(out.rows(), out.cols(), out.vals()));
+
+  thrust::transform(exec_policy,
+                    transform_in,
+                    transform_in + out.nnz,
+                    out.vals(),
+                    [=] __device__(const thrust::tuple<value_idx, value_idx, value_t>& tup) {
+                      return thrust::get<0>(tup) == thrust::get<1>(tup)
+                               ? std::numeric_limits<value_t>::max()
+                               : thrust::get<2>(tup);
+                    });
+}
+
+};  // end namespace Reachability
+};  // end namespace detail
+};  // end namespace HDBSCAN
+};  // end namespace ML
\ No newline at end of file
diff --git a/cpp/src/kmeans/common.cuh b/cpp/src/kmeans/common.cuh
index 2ca5493363..f70325b7dd 100644
--- a/cpp/src/kmeans/common.cuh
+++ b/cpp/src/kmeans/common.cuh
@@ -24,19 +24,19 @@
 
 #include <common/tensor.hpp>
 
-#include <matrix/gather.cuh>
 #include <raft/linalg/reduce_cols_by_key.cuh>
 #include <raft/linalg/reduce_rows_by_key.cuh>
+#include <raft/matrix/gather.cuh>
 #include <raft/random/permute.cuh>
 
 #include <raft/core/comms.hpp>
-#include <raft/core/cudart_utils.hpp>
 #include <raft/distance/fused_l2_nn.cuh>
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/matrix_vector_op.cuh>
 #include <raft/linalg/mean_squared_error.cuh>
 #include <raft/linalg/reduce.cuh>
 #include <raft/random/rng.cuh>
+#include <raft/util/cudart_utils.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
@@ -241,7 +241,7 @@ Tensor<DataT, 2, IndexT> sampleCentroids(const raft::handle_t& handle,
 
   Tensor<DataT, 2, IndexT> inRankCp({nPtsSampledInRank, n_features}, stream);
 
-  MLCommon::Matrix::gather(
+  raft::matrix::gather(
     X.data(),
     X.getSize(1),
     X.getSize(0),
@@ -601,13 +601,13 @@ void shuffleAndGather(const raft::handle_t& handle,
     raft::copy(indices.data(), ht_indices.data(), indices.numElements(), stream);
   }
 
-  MLCommon::Matrix::gather(in.data(),
-                           in.getSize(1),
-                           in.getSize(0),
-                           indices.data(),
-                           n_samples_to_gather,
-                           out.data(),
-                           stream);
+  raft::matrix::gather(in.data(),
+                       in.getSize(1),
+                       in.getSize(0),
+                       indices.data(),
+                       n_samples_to_gather,
+                       out.data(),
+                       stream);
 }
 
 template <typename DataT, typename IndexT>
diff --git a/cpp/src/kmeans/kmeans_mg_impl.cuh b/cpp/src/kmeans/kmeans_mg_impl.cuh
index af5427e765..59069403dc 100644
--- a/cpp/src/kmeans/kmeans_mg_impl.cuh
+++ b/cpp/src/kmeans/kmeans_mg_impl.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 #include <cuml/cluster/kmeans.hpp>
-#include <raft/core/cudart_utils.hpp>
+#include <raft/util/cudart_utils.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
@@ -586,7 +586,7 @@ void fit(const raft::handle_t& handle,
 
     // copy the centroids[i] to newCentroids[i] when wtInCluster[i] is 0
     cub::ArgIndexInputIterator<DataT*> itr_wt(wtInCluster.data());
-    MLCommon::Matrix::gather_if(
+    raft::matrix::gather_if(
       centroids.data(),
       centroids.getSize(1),
       centroids.getSize(0),
diff --git a/cpp/src/kmeans/sg_impl.cuh b/cpp/src/kmeans/sg_impl.cuh
index ff308db2bd..5084f449fa 100644
--- a/cpp/src/kmeans/sg_impl.cuh
+++ b/cpp/src/kmeans/sg_impl.cuh
@@ -18,7 +18,7 @@
 
 #include "common.cuh"
 #include <cuml/cluster/kmeans.hpp>
-#include <raft/core/cudart_utils.hpp>
+#include <raft/util/cudart_utils.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
@@ -176,7 +176,7 @@ void fit(const raft::handle_t& handle,
 
     // copy centroids[i] to newCentroids[i] when wtInCluster[i] is 0
     cub::ArgIndexInputIterator<DataT*> itr_wt(wtInCluster.data());
-    MLCommon::Matrix::gather_if(
+    raft::matrix::gather_if(
       centroids.data(),
       centroids.getSize(1),
       centroids.getSize(0),
diff --git a/cpp/src/knn/knn_opg_common.cuh b/cpp/src/knn/knn_opg_common.cuh
index 88db6858aa..dc2cb3c8a8 100644
--- a/cpp/src/knn/knn_opg_common.cuh
+++ b/cpp/src/knn/knn_opg_common.cuh
@@ -25,965 +25,952 @@
 #include <cumlprims/opg/matrix/part_descriptor.hpp>
 
 #include <raft/core/comms.hpp>
-#include <raft/core/cudart_utils.hpp>
-#include <raft/cuda_utils.cuh>
-<<<<<<< HEAD
-#include <raft/cudart_utils.h>
-#include <raft/spatial/knn/knn.hpp>
-#include <raft/spatial/knn/specializations.hpp>
-=======
 #include <raft/spatial/knn/knn.cuh>
->>>>>>> branch-22.10
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
 
 #include <cstddef>
 #include <memory>
 #include <set>
 
-namespace ML
-{
-  namespace KNN {
-  namespace opg {
-
-  namespace knn_common {
-
-  /**
-   * The enumeration of KNN distributed operations
-   */
-  enum knn_operation {
-    knn,            /**< Simple KNN */
-    classification, /**< KNN classification */
-    class_proba,    /**< KNN classification probabilities */
-    regression      /**< KNN regression */
-  };
-
-  /**
-   * A structure to store parameters for distributed KNN
-   */
-  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-  struct opg_knn_param {
-    opg_knn_param(knn_operation knn_op,
-                  std::vector<Matrix::Data<in_t>*>* idx_data,
-                  Matrix::PartDescriptor* idx_desc,
-                  std::vector<Matrix::Data<in_t>*>* query_data,
-                  Matrix::PartDescriptor* query_desc,
-                  bool rowMajorIndex,
-                  bool rowMajorQuery,
-                  size_t k,
-                  size_t batch_size,
-                  bool verbose)
-    {
-      this->knn_op        = knn_op;
-      this->idx_data      = idx_data;
-      this->idx_desc      = idx_desc;
-      this->query_data    = query_data;
-      this->query_desc    = query_desc;
-      this->rowMajorIndex = rowMajorIndex;
-      this->rowMajorQuery = rowMajorQuery;
-      this->k             = k;
-      this->batch_size    = batch_size;
-      this->verbose       = verbose;
-    }
+namespace ML {
+namespace KNN {
+namespace opg {
 
-    knn_operation knn_op; /**< Type of KNN distributed operation */
-    std::vector<Matrix::Data<dist_t>*>* out_D    = nullptr; /**< KNN distances output array */
-    std::vector<Matrix::Data<ind_t>*>* out_I     = nullptr; /**< KNN indices output array */
-    std::vector<Matrix::Data<in_t>*>* idx_data   = nullptr; /**< Index input array */
-    Matrix::PartDescriptor* idx_desc             = nullptr; /**< Descriptor for index input array */
-    std::vector<Matrix::Data<in_t>*>* query_data = nullptr; /**< Query input array */
-    Matrix::PartDescriptor* query_desc           = nullptr; /**< Descriptor for query input array */
-    bool rowMajorIndex;                                     /**< Is index row major? */
-    bool rowMajorQuery;                                     /**< Is query row major? */
-    size_t k          = 0;                                  /**< Number of nearest neighbors */
-    size_t batch_size = 0;                                  /**< Batch size */
-    bool verbose;                                           /**< verbose */
-
-    std::size_t n_outputs = 0;              /**< Number of outputs per query (cl&re) */
-    std::vector<std::vector<out_t*>>* y;    /**< Labels input array (cl&re) */
-    std::vector<Matrix::Data<out_t>*>* out; /**< KNN outputs output array (cl&re) */
-
-    std::vector<int>* n_unique       = nullptr; /**< Number of unique labels (classification) */
-    std::vector<out_t*>* uniq_labels = nullptr; /**< Unique labels (classification) */
-    std::vector<std::vector<float*>>* probas =
-      nullptr; /**< KNN classification probabilities output array (class-probas) */
-  };
-
-  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-  struct KNN_params : public opg_knn_param<in_t, ind_t, dist_t, out_t> {
-    KNN_params(knn_operation knn_op,
-               std::vector<Matrix::Data<in_t>*>* idx_data,
-               Matrix::PartDescriptor* idx_desc,
-               std::vector<Matrix::Data<in_t>*>* query_data,
-               Matrix::PartDescriptor* query_desc,
-               bool rowMajorIndex,
-               bool rowMajorQuery,
-               size_t k,
-               size_t batch_size,
-               bool verbose,
-               std::vector<Matrix::Data<dist_t>*>* out_D,
-               std::vector<Matrix::Data<ind_t>*>* out_I)
-      : opg_knn_param<in_t, ind_t, dist_t, out_t>(knn_op,
-                                                  idx_data,
-                                                  idx_desc,
-                                                  query_data,
-                                                  query_desc,
-                                                  rowMajorIndex,
-                                                  rowMajorQuery,
-                                                  k,
-                                                  batch_size,
-                                                  verbose)
-    {
-      this->out_D = out_D;
-      this->out_I = out_I;
-    }
-  };
-
-  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-  struct KNN_RE_params : public opg_knn_param<in_t, ind_t, dist_t, out_t> {
-    KNN_RE_params(knn_operation knn_op,
-                  std::vector<Matrix::Data<in_t>*>* idx_data,
-                  Matrix::PartDescriptor* idx_desc,
-                  std::vector<Matrix::Data<in_t>*>* query_data,
-                  Matrix::PartDescriptor* query_desc,
-                  bool rowMajorIndex,
-                  bool rowMajorQuery,
-                  size_t k,
-                  size_t batch_size,
-                  bool verbose,
-                  std::size_t n_outputs,
-                  std::vector<std::vector<out_t*>>* y,
-                  std::vector<Matrix::Data<out_t>*>* out)
-      : opg_knn_param<in_t, ind_t, dist_t, out_t>(knn_op,
-                                                  idx_data,
-                                                  idx_desc,
-                                                  query_data,
-                                                  query_desc,
-                                                  rowMajorIndex,
-                                                  rowMajorQuery,
-                                                  k,
-                                                  batch_size,
-                                                  verbose)
-    {
-      this->n_outputs = n_outputs;
-      this->y         = y;
-      this->out       = out;
-    }
-  };
-
-  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-  struct KNN_CL_params : public opg_knn_param<in_t, ind_t, dist_t, out_t> {
-    KNN_CL_params(knn_operation knn_op,
-                  std::vector<Matrix::Data<in_t>*>* idx_data,
-                  Matrix::PartDescriptor* idx_desc,
-                  std::vector<Matrix::Data<in_t>*>* query_data,
-                  Matrix::PartDescriptor* query_desc,
-                  bool rowMajorIndex,
-                  bool rowMajorQuery,
-                  size_t k,
-                  size_t batch_size,
-                  bool verbose,
-                  std::size_t n_outputs,
-                  std::vector<std::vector<out_t*>>* y,
-                  std::vector<int>* n_unique,
-                  std::vector<out_t*>* uniq_labels,
-                  std::vector<Matrix::Data<out_t>*>* out,
-                  std::vector<std::vector<float*>>* probas)
-      : opg_knn_param<in_t, ind_t, dist_t, out_t>(knn_op,
-                                                  idx_data,
-                                                  idx_desc,
-                                                  query_data,
-                                                  query_desc,
-                                                  rowMajorIndex,
-                                                  rowMajorQuery,
-                                                  k,
-                                                  batch_size,
-                                                  verbose)
-    {
-      this->n_outputs   = n_outputs;
-      this->y           = y;
-      this->n_unique    = n_unique;
-      this->uniq_labels = uniq_labels;
-      this->out         = out;
-      this->probas      = probas;
-    }
-  };
-
-  /**
-   * A structure to store utilities for distributed KNN operations
-   */
-  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-  struct opg_knn_work {
-    opg_knn_work(opg_knn_param<in_t, ind_t, dist_t, out_t>& params, raft::handle_t& handle)
-      : res_D(0, handle.get_stream()), res_I(0, handle.get_stream()), res(0, handle.get_stream())
-    {
-      this->my_rank           = handle.get_comms().get_rank();
-      this->idxRanks          = params.idx_desc->uniqueRanks();
-      this->idxPartsToRanks   = params.idx_desc->partsToRanks;
-      this->local_idx_parts   = params.idx_desc->blocksOwnedBy(handle.get_comms().get_rank());
-      this->queryPartsToRanks = params.query_desc->partsToRanks;
-    }
+namespace knn_common {
 
-    int my_rank;            /**< Rank of this worker */
-    std::set<int> idxRanks; /**< Set of ranks having at least 1 index partition */
-    std::vector<Matrix::RankSizePair*> idxPartsToRanks;   /**< Index parts to rank */
-    std::vector<Matrix::RankSizePair*> local_idx_parts;   /**< List of index parts stored locally */
-    std::vector<Matrix::RankSizePair*> queryPartsToRanks; /**< Query parts to rank */
-
-    rmm::device_uvector<dist_t> res_D; /**< Temporary allocation to exchange distances */
-    rmm::device_uvector<ind_t> res_I;  /**< Temporary allocation to exchange indices */
-    rmm::device_uvector<out_t> res;    /**< Temporary allocation to exchange outputs (cl&re) */
-  };
-
-  /*!
-   Main function, computes distributed KNN operation
-   @param[in] params Parameters for distrbuted KNN operation
-   @param[in] handle RAFT handle
-   */
-  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-  void opg_knn(opg_knn_param<in_t, ind_t, dist_t, out_t>& params, raft::handle_t& handle)
+/**
+ * The enumeration of KNN distributed operations
+ */
+enum knn_operation {
+  knn,            /**< Simple KNN */
+  classification, /**< KNN classification */
+  class_proba,    /**< KNN classification probabilities */
+  regression      /**< KNN regression */
+};
+
+/**
+ * A structure to store parameters for distributed KNN
+ */
+template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+struct opg_knn_param {
+  opg_knn_param(knn_operation knn_op,
+                std::vector<Matrix::Data<in_t>*>* idx_data,
+                Matrix::PartDescriptor* idx_desc,
+                std::vector<Matrix::Data<in_t>*>* query_data,
+                Matrix::PartDescriptor* query_desc,
+                bool rowMajorIndex,
+                bool rowMajorQuery,
+                size_t k,
+                size_t batch_size,
+                bool verbose)
   {
-    opg_knn_work<in_t, ind_t, dist_t, out_t> work(params, handle);
-
-    ASSERT(params.k <= 1024, "k must be <= 1024");
-    ASSERT(params.batch_size > 0, "max_batch_size must be > 0");
-    ASSERT(params.k < params.idx_desc->M, "k must be less than the total number of query rows");
-    for (Matrix::RankSizePair* rsp : work.idxPartsToRanks) {
-      ASSERT(rsp->size >= params.k,
-             "k must be <= the number of rows in the smallest index partition.");
-    }
+    this->knn_op        = knn_op;
+    this->idx_data      = idx_data;
+    this->idx_desc      = idx_desc;
+    this->query_data    = query_data;
+    this->query_desc    = query_desc;
+    this->rowMajorIndex = rowMajorIndex;
+    this->rowMajorQuery = rowMajorQuery;
+    this->k             = k;
+    this->batch_size    = batch_size;
+    this->verbose       = verbose;
+  }
+
+  knn_operation knn_op;                                   /**< Type of KNN distributed operation */
+  std::vector<Matrix::Data<dist_t>*>* out_D    = nullptr; /**< KNN distances output array */
+  std::vector<Matrix::Data<ind_t>*>* out_I     = nullptr; /**< KNN indices output array */
+  std::vector<Matrix::Data<in_t>*>* idx_data   = nullptr; /**< Index input array */
+  Matrix::PartDescriptor* idx_desc             = nullptr; /**< Descriptor for index input array */
+  std::vector<Matrix::Data<in_t>*>* query_data = nullptr; /**< Query input array */
+  Matrix::PartDescriptor* query_desc           = nullptr; /**< Descriptor for query input array */
+  bool rowMajorIndex;                                     /**< Is index row major? */
+  bool rowMajorQuery;                                     /**< Is query row major? */
+  size_t k          = 0;                                  /**< Number of nearest neighbors */
+  size_t batch_size = 0;                                  /**< Batch size */
+  bool verbose;                                           /**< verbose */
+
+  std::size_t n_outputs = 0;              /**< Number of outputs per query (cl&re) */
+  std::vector<std::vector<out_t*>>* y;    /**< Labels input array (cl&re) */
+  std::vector<Matrix::Data<out_t>*>* out; /**< KNN outputs output array (cl&re) */
+
+  std::vector<int>* n_unique       = nullptr; /**< Number of unique labels (classification) */
+  std::vector<out_t*>* uniq_labels = nullptr; /**< Unique labels (classification) */
+  std::vector<std::vector<float*>>* probas =
+    nullptr; /**< KNN classification probabilities output array (class-probas) */
+};
+
+template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+struct KNN_params : public opg_knn_param<in_t, ind_t, dist_t, out_t> {
+  KNN_params(knn_operation knn_op,
+             std::vector<Matrix::Data<in_t>*>* idx_data,
+             Matrix::PartDescriptor* idx_desc,
+             std::vector<Matrix::Data<in_t>*>* query_data,
+             Matrix::PartDescriptor* query_desc,
+             bool rowMajorIndex,
+             bool rowMajorQuery,
+             size_t k,
+             size_t batch_size,
+             bool verbose,
+             std::vector<Matrix::Data<dist_t>*>* out_D,
+             std::vector<Matrix::Data<ind_t>*>* out_I)
+    : opg_knn_param<in_t, ind_t, dist_t, out_t>(knn_op,
+                                                idx_data,
+                                                idx_desc,
+                                                query_data,
+                                                query_desc,
+                                                rowMajorIndex,
+                                                rowMajorQuery,
+                                                k,
+                                                batch_size,
+                                                verbose)
+  {
+    this->out_D = out_D;
+    this->out_I = out_I;
+  }
+};
+
+template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+struct KNN_RE_params : public opg_knn_param<in_t, ind_t, dist_t, out_t> {
+  KNN_RE_params(knn_operation knn_op,
+                std::vector<Matrix::Data<in_t>*>* idx_data,
+                Matrix::PartDescriptor* idx_desc,
+                std::vector<Matrix::Data<in_t>*>* query_data,
+                Matrix::PartDescriptor* query_desc,
+                bool rowMajorIndex,
+                bool rowMajorQuery,
+                size_t k,
+                size_t batch_size,
+                bool verbose,
+                std::size_t n_outputs,
+                std::vector<std::vector<out_t*>>* y,
+                std::vector<Matrix::Data<out_t>*>* out)
+    : opg_knn_param<in_t, ind_t, dist_t, out_t>(knn_op,
+                                                idx_data,
+                                                idx_desc,
+                                                query_data,
+                                                query_desc,
+                                                rowMajorIndex,
+                                                rowMajorQuery,
+                                                k,
+                                                batch_size,
+                                                verbose)
+  {
+    this->n_outputs = n_outputs;
+    this->y         = y;
+    this->out       = out;
+  }
+};
+
+template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+struct KNN_CL_params : public opg_knn_param<in_t, ind_t, dist_t, out_t> {
+  KNN_CL_params(knn_operation knn_op,
+                std::vector<Matrix::Data<in_t>*>* idx_data,
+                Matrix::PartDescriptor* idx_desc,
+                std::vector<Matrix::Data<in_t>*>* query_data,
+                Matrix::PartDescriptor* query_desc,
+                bool rowMajorIndex,
+                bool rowMajorQuery,
+                size_t k,
+                size_t batch_size,
+                bool verbose,
+                std::size_t n_outputs,
+                std::vector<std::vector<out_t*>>* y,
+                std::vector<int>* n_unique,
+                std::vector<out_t*>* uniq_labels,
+                std::vector<Matrix::Data<out_t>*>* out,
+                std::vector<std::vector<float*>>* probas)
+    : opg_knn_param<in_t, ind_t, dist_t, out_t>(knn_op,
+                                                idx_data,
+                                                idx_desc,
+                                                query_data,
+                                                query_desc,
+                                                rowMajorIndex,
+                                                rowMajorQuery,
+                                                k,
+                                                batch_size,
+                                                verbose)
+  {
+    this->n_outputs   = n_outputs;
+    this->y           = y;
+    this->n_unique    = n_unique;
+    this->uniq_labels = uniq_labels;
+    this->out         = out;
+    this->probas      = probas;
+  }
+};
 
-    int local_parts_completed = 0;
-    // Loop through query parts for all ranks
-    for (int i = 0; i < params.query_desc->totalBlocks(); i++) {  // For each query partitions
-      Matrix::RankSizePair* partition = work.queryPartsToRanks[i];
-      int part_rank                   = partition->rank;
-      size_t part_n_rows              = partition->size;
+/**
+ * A structure to store utilities for distributed KNN operations
+ */
+template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+struct opg_knn_work {
+  opg_knn_work(opg_knn_param<in_t, ind_t, dist_t, out_t>& params, raft::handle_t& handle)
+    : res_D(0, handle.get_stream()), res_I(0, handle.get_stream()), res(0, handle.get_stream())
+  {
+    this->my_rank           = handle.get_comms().get_rank();
+    this->idxRanks          = params.idx_desc->uniqueRanks();
+    this->idxPartsToRanks   = params.idx_desc->partsToRanks;
+    this->local_idx_parts   = params.idx_desc->blocksOwnedBy(handle.get_comms().get_rank());
+    this->queryPartsToRanks = params.query_desc->partsToRanks;
+  }
 
-      size_t total_batches     = raft::ceildiv(part_n_rows, params.batch_size);
-      size_t total_n_processed = 0;
+  int my_rank;            /**< Rank of this worker */
+  std::set<int> idxRanks; /**< Set of ranks having at least 1 index partition */
+  std::vector<Matrix::RankSizePair*> idxPartsToRanks;   /**< Index parts to rank */
+  std::vector<Matrix::RankSizePair*> local_idx_parts;   /**< List of index parts stored locally */
+  std::vector<Matrix::RankSizePair*> queryPartsToRanks; /**< Query parts to rank */
+
+  rmm::device_uvector<dist_t> res_D; /**< Temporary allocation to exchange distances */
+  rmm::device_uvector<ind_t> res_I;  /**< Temporary allocation to exchange indices */
+  rmm::device_uvector<out_t> res;    /**< Temporary allocation to exchange outputs (cl&re) */
+};
+
+/*!
+ Main function, computes distributed KNN operation
+ @param[in] params Parameters for distributed KNN operation
+ @param[in] handle RAFT handle
+ */
+template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+void opg_knn(opg_knn_param<in_t, ind_t, dist_t, out_t>& params, raft::handle_t& handle)
+{
+  opg_knn_work<in_t, ind_t, dist_t, out_t> work(params, handle);
+
+  ASSERT(params.k <= 1024, "k must be <= 1024");
+  ASSERT(params.batch_size > 0, "max_batch_size must be > 0");
+  ASSERT(params.k < params.idx_desc->M, "k must be less than the total number of query rows");
+  for (Matrix::RankSizePair* rsp : work.idxPartsToRanks) {
+    ASSERT(rsp->size >= params.k,
+           "k must be <= the number of rows in the smallest index partition.");
+  }
 
-      // For each batch in a query partition
-      for (std::size_t cur_batch = 0; cur_batch < total_batches; cur_batch++) {
-        size_t cur_batch_size = params.batch_size;
+  int local_parts_completed = 0;
+  // Loop through query parts for all ranks
+  for (int i = 0; i < params.query_desc->totalBlocks(); i++) {  // For each query partitions
+    Matrix::RankSizePair* partition = work.queryPartsToRanks[i];
+    int part_rank                   = partition->rank;
+    size_t part_n_rows              = partition->size;
 
-        if (cur_batch == total_batches - 1)
-          cur_batch_size = part_n_rows - (cur_batch * params.batch_size);
+    size_t total_batches     = raft::ceildiv(part_n_rows, params.batch_size);
+    size_t total_n_processed = 0;
 
-        if (work.my_rank == part_rank) CUML_LOG_DEBUG("Root Rank is %d", work.my_rank);
+    // For each batch in a query partition
+    for (std::size_t cur_batch = 0; cur_batch < total_batches; cur_batch++) {
+      size_t cur_batch_size = params.batch_size;
 
-        /**
-         * Root broadcasts batch to all other ranks
-         */
-        CUML_LOG_DEBUG("Rank %d: Performing Broadcast", work.my_rank);
+      if (cur_batch == total_batches - 1)
+        cur_batch_size = part_n_rows - (cur_batch * params.batch_size);
 
-        rmm::device_uvector<in_t> part_data(0, handle.get_stream());
+      if (work.my_rank == part_rank) CUML_LOG_DEBUG("Root Rank is %d", work.my_rank);
 
-        size_t batch_input_elms   = cur_batch_size * params.query_desc->N;
-        size_t batch_input_offset = batch_input_elms * cur_batch;
+      /**
+       * Root broadcasts batch to all other ranks
+       */
+      CUML_LOG_DEBUG("Rank %d: Performing Broadcast", work.my_rank);
 
-        in_t* cur_query_ptr{nullptr};
+      rmm::device_uvector<in_t> part_data(0, handle.get_stream());
 
-        rmm::device_uvector<in_t> tmp_batch_buf(0, handle.get_stream());
-        // current partition's owner rank broadcasts
-        if (part_rank == work.my_rank) {
-          Matrix::Data<in_t>* data = params.query_data->at(local_parts_completed);
+      size_t batch_input_elms   = cur_batch_size * params.query_desc->N;
+      size_t batch_input_offset = batch_input_elms * cur_batch;
 
-          // If query is column major and total_batches > 0, create a
-          // temporary buffer for the batch so that we can stack rows.
-          if (!params.rowMajorQuery && total_batches > 1) {
-            tmp_batch_buf.resize(batch_input_elms, handle.get_stream());
-            for (std::size_t col_data = 0; col_data < params.query_desc->N; col_data++) {
-              raft::copy(tmp_batch_buf.data() + (col_data * cur_batch_size),
-                         data->ptr + ((col_data * part_n_rows) + total_n_processed),
-                         cur_batch_size,
-                         handle.get_stream());
-            }
-            cur_query_ptr = tmp_batch_buf.data();
+      in_t* cur_query_ptr{nullptr};
 
-          } else {
-            cur_query_ptr = data->ptr + batch_input_offset;
+      rmm::device_uvector<in_t> tmp_batch_buf(0, handle.get_stream());
+      // current partition's owner rank broadcasts
+      if (part_rank == work.my_rank) {
+        Matrix::Data<in_t>* data = params.query_data->at(local_parts_completed);
+
+        // If query is column major and total_batches > 1, create a
+        // temporary buffer for the batch so that we can stack rows.
+        if (!params.rowMajorQuery && total_batches > 1) {
+          tmp_batch_buf.resize(batch_input_elms, handle.get_stream());
+          for (std::size_t col_data = 0; col_data < params.query_desc->N; col_data++) {
+            raft::copy(tmp_batch_buf.data() + (col_data * cur_batch_size),
+                       data->ptr + ((col_data * part_n_rows) + total_n_processed),
+                       cur_batch_size,
+                       handle.get_stream());
           }
+          cur_query_ptr = tmp_batch_buf.data();
 
-          // all other (index) ranks receive
-        } else if (work.idxRanks.find(work.my_rank) != work.idxRanks.end()) {
-          part_data.resize(batch_input_elms, handle.get_stream());
-          cur_query_ptr = part_data.data();
+        } else {
+          cur_query_ptr = data->ptr + batch_input_offset;
         }
 
-        bool my_rank_is_idx = work.idxRanks.find(work.my_rank) != work.idxRanks.end();
-
-        /**
-         * Send query to index partitions
-         */
-        if (work.my_rank == part_rank || my_rank_is_idx)
-          broadcast_query(work, handle, part_rank, cur_query_ptr, batch_input_elms);
+        // all other (index) ranks receive
+      } else if (work.idxRanks.find(work.my_rank) != work.idxRanks.end()) {
+        part_data.resize(batch_input_elms, handle.get_stream());
+        cur_query_ptr = part_data.data();
+      }
 
-        if (my_rank_is_idx) {
-          /**
-           * All index ranks perform local KNN
-           */
-          CUML_LOG_DEBUG("Rank %d: Performing Local KNN", work.my_rank);
+      bool my_rank_is_idx = work.idxRanks.find(work.my_rank) != work.idxRanks.end();
 
-          size_t batch_knn_elms = params.k * cur_batch_size;
+      /**
+       * Send query to index partitions
+       */
+      if (work.my_rank == part_rank || my_rank_is_idx)
+        broadcast_query(work, handle, part_rank, cur_query_ptr, batch_input_elms);
 
-          if (params.knn_op != knn_operation::knn) {
-            // No labels for KNN only operation
-            work.res.resize(batch_knn_elms * params.n_outputs, handle.get_stream());
-          }
-          work.res_I.resize(batch_knn_elms, handle.get_stream());
-          work.res_D.resize(batch_knn_elms, handle.get_stream());
+      if (my_rank_is_idx) {
+        /**
+         * All index ranks perform local KNN
+         */
+        CUML_LOG_DEBUG("Rank %d: Performing Local KNN", work.my_rank);
 
-          // Perform a local KNN search
-          perform_local_knn(params, work, handle, cur_query_ptr, cur_batch_size);
+        size_t batch_knn_elms = params.k * cur_batch_size;
 
-          if (params.knn_op != knn_operation::knn) {
-            // Get the right labels for indices obtained after a KNN merge
-            copy_label_outputs_from_index_parts(params, work, handle, cur_batch_size);
-          }
+        if (params.knn_op != knn_operation::knn) {
+          // No labels for KNN only operation
+          work.res.resize(batch_knn_elms * params.n_outputs, handle.get_stream());
         }
+        work.res_I.resize(batch_knn_elms, handle.get_stream());
+        work.res_D.resize(batch_knn_elms, handle.get_stream());
 
-        if (part_rank == work.my_rank || my_rank_is_idx) {
-          /**
-           * Ranks exchange results.
-           * Each rank having index partition(s) sends
-           * its local results (my_rank_is_idx)
-           * Additionally the owner of currently processed query partition
-           * receives and performs a reduce even if it has
-           * no index partition (part_rank == my_rank)
-           */
-          CUML_LOG_DEBUG("Rank %d: Exchanging results", work.my_rank);
-          exchange_results(params, work, handle, part_rank, cur_batch_size);
+        // Perform a local KNN search
+        perform_local_knn(params, work, handle, cur_query_ptr, cur_batch_size);
+
+        if (params.knn_op != knn_operation::knn) {
+          // Get the right labels for indices obtained after a KNN merge
+          copy_label_outputs_from_index_parts(params, work, handle, cur_batch_size);
         }
+      }
 
+      if (part_rank == work.my_rank || my_rank_is_idx) {
         /**
-         * Root rank performs local reduce
+         * Ranks exchange results.
+         * Each rank having index partition(s) sends
+         * its local results (my_rank_is_idx)
+         * Additionally the owner of currently processed query partition
+         * receives and performs a reduce even if it has
+         * no index partition (part_rank == my_rank)
          */
-        if (part_rank == work.my_rank) {
-          CUML_LOG_DEBUG("Rank %d: Performing Reduce", work.my_rank);
-
-          // Reduce all local results to a global result for a given query batch
-          reduce(params, work, handle, local_parts_completed, total_n_processed, cur_batch_size);
-
-          CUML_LOG_DEBUG("Rank %d: Finished Reduce", work.my_rank);
-        }
-
-        total_n_processed += cur_batch_size;
+        CUML_LOG_DEBUG("Rank %d: Exchanging results", work.my_rank);
+        exchange_results(params, work, handle, part_rank, cur_batch_size);
       }
 
-      if (work.my_rank == part_rank) local_parts_completed++;
-    }
-  };
-
-  /*!
-   Broadcast query batch accross all the workers
-   @param[in] params Parameters for distrbuted KNN operation
-   @param[in] handle RAFT handle
-   @param[in] part_rank Rank of currently processed query batch
-   @param[in] broadcast Pointer to broadcast
-   @param[in] broadcast_size Size of broadcast
-   */
-  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-  void broadcast_query(opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-                       raft::handle_t& handle,
-                       int part_rank,
-                       in_t* broadcast,
-                       size_t broadcast_size)
-  {
-    int request_idx = 0;
-    std::vector<raft::comms::request_t> requests;
-    if (part_rank == work.my_rank) {  // Either broadcast to other workers
-      int idx_rank_size = work.idxRanks.size();
-      if (work.idxRanks.find(work.my_rank) != work.idxRanks.end()) { --idx_rank_size; }
+      /**
+       * Root rank performs local reduce
+       */
+      if (part_rank == work.my_rank) {
+        CUML_LOG_DEBUG("Rank %d: Performing Reduce", work.my_rank);
 
-      requests.resize(idx_rank_size);
+        // Reduce all local results to a global result for a given query batch
+        reduce(params, work, handle, local_parts_completed, total_n_processed, cur_batch_size);
 
-      for (int rank : work.idxRanks) {
-        if (rank != work.my_rank) {
-          handle.get_comms().isend(
-            broadcast, broadcast_size, rank, 0, requests.data() + request_idx);
-          ++request_idx;
-        }
+        CUML_LOG_DEBUG("Rank %d: Finished Reduce", work.my_rank);
       }
 
-    } else {  // Or receive from broadcaster
-      requests.resize(1);
-      handle.get_comms().irecv(
-        broadcast, broadcast_size, part_rank, 0, requests.data() + request_idx);
-      ++request_idx;
+      total_n_processed += cur_batch_size;
     }
 
-    try {
-      handle.get_comms().waitall(requests.size(), requests.data());
-    } catch (raft::exception& e) {
-      CUML_LOG_DEBUG("FAILURE!");
-    }
+    if (work.my_rank == part_rank) local_parts_completed++;
   }
-
-  /*!
-   Perform a local KNN search for a given query batch
-   @param[in] params Parameters for distrbuted KNN operation
-   @param[in] work Current work for distributed KNN
-   @param[in] handle RAFT handle
-   @param[in] query Pointer to query
-   @param[in] query_size Size of query
-   */
-  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-  void perform_local_knn(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
-                         opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-                         raft::handle_t& handle,
-                         in_t* query,
-                         size_t query_size)
-  {
-    std::vector<in_t*> ptrs(params.idx_data->size());
-    std::vector<std::size_t> sizes(params.idx_data->size());
-
-    for (std::size_t cur_idx = 0; cur_idx < params.idx_data->size(); cur_idx++) {
-      ptrs[cur_idx]  = params.idx_data->at(cur_idx)->ptr;
-      sizes[cur_idx] = work.local_idx_parts[cur_idx]->size;
+};
+
+/*!
+ Broadcast query batch across all the workers
+ @param[in] work Current work for distributed KNN
+ @param[in] handle RAFT handle
+ @param[in] part_rank Rank of currently processed query batch
+ @param[in] broadcast Pointer to broadcast
+ @param[in] broadcast_size Size of broadcast
+ */
+template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+void broadcast_query(opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+                     raft::handle_t& handle,
+                     int part_rank,
+                     in_t* broadcast,
+                     size_t broadcast_size)
+{
+  int request_idx = 0;
+  std::vector<raft::comms::request_t> requests;
+  if (part_rank == work.my_rank) {  // Either broadcast to other workers
+    int idx_rank_size = work.idxRanks.size();
+    if (work.idxRanks.find(work.my_rank) != work.idxRanks.end()) { --idx_rank_size; }
+
+    requests.resize(idx_rank_size);
+
+    for (int rank : work.idxRanks) {
+      if (rank != work.my_rank) {
+        handle.get_comms().isend(broadcast, broadcast_size, rank, 0, requests.data() + request_idx);
+        ++request_idx;
+      }
     }
 
-    // Offset nearest neighbor index matrix by partition indices
-    std::vector<size_t> start_indices = params.idx_desc->startIndices(work.my_rank);
-    // PartDescriptor uses size_t while FAISS uses int64_t
-    // so we need to do a quick conversion.
-    std::vector<int64_t> start_indices_long;
-    for (size_t start_index : start_indices)
-      start_indices_long.push_back((int64_t)start_index);
-
-    // ID ranges need to be offset by each local partition's
-    // starting indices.
-    raft::spatial::knn::brute_force_knn<std::int64_t, float, std::size_t>(
-      handle,
-      ptrs,
-      sizes,
-      params.idx_desc->N,
-      query,
-      query_size,
-      work.res_I.data(),
-      work.res_D.data(),
-      params.k,
-      params.rowMajorIndex,
-      params.rowMajorQuery,
-      &start_indices_long,
-      raft::distance::DistanceType::L2SqrtExpanded);
-    handle.sync_stream(handle.get_stream());
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  } else {  // Or receive from broadcaster
+    requests.resize(1);
+    handle.get_comms().irecv(
+      broadcast, broadcast_size, part_rank, 0, requests.data() + request_idx);
+    ++request_idx;
   }
 
-  /**
-   * This function copies the labels associated to the locally merged indices
-   * from the index partitions to a merged array of labels
-   * @param[out] out merged labels
-   * @param[in] knn_indices merged indices
-   * @param[in] parts unmerged labels in partitions
-   * @param[in] offsets array splitting the partitions making it possible
-   * to identify the origin partition of an nearest neighbor index
-   * @param[in] cur_batch_size current batch size
-   * @param[in] n_parts number of partitions
-   * @param[in] n_labels number of labels to write (batch_size * n_outputs)
-   */
-  template <int TPB_X, typename ind_t, typename out_t>
-  __global__ void copy_label_outputs_from_index_parts_kernel(out_t* out,
-                                                             ind_t* knn_indices,
-                                                             out_t** parts,
-                                                             uint64_t* offsets,
-                                                             size_t cur_batch_size,
-                                                             int n_parts,
-                                                             int n_labels)
-  {
-    uint64_t i = (blockIdx.x * TPB_X) + threadIdx.x;
-    if (i >= n_labels) return;
-    uint64_t nn_idx = knn_indices[i];
-    int part_idx    = 0;
-    for (; part_idx < n_parts && nn_idx >= offsets[part_idx]; part_idx++) {}
-    part_idx        = std::min(std::max(0, part_idx - 1), n_parts - 1);
-    uint64_t offset = nn_idx - offsets[part_idx];
-    out[i]          = parts[part_idx][offset];
+  try {
+    handle.get_comms().waitall(requests.size(), requests.data());
+  } catch (raft::exception& e) {
+    CUML_LOG_DEBUG("FAILURE!");
   }
+}
+
+/*!
+ Perform a local KNN search for a given query batch
+ @param[in] params Parameters for distributed KNN operation
+ @param[in] work Current work for distributed KNN
+ @param[in] handle RAFT handle
+ @param[in] query Pointer to query
+ @param[in] query_size Size of query
+ */
+template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+void perform_local_knn(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
+                       opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+                       raft::handle_t& handle,
+                       in_t* query,
+                       size_t query_size)
+{
+  std::vector<in_t*> ptrs(params.idx_data->size());
+  std::vector<std::size_t> sizes(params.idx_data->size());
 
-  /*!
-   Get the right labels for indices obtained after a KNN merge
-   @param[in] params Parameters for distrbuted KNN operation
-   @param[in] work Current work for distributed KNN
-   @param[in] handle RAFT handle
-   @param[in] batch_size Batch size
-   */
-  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-  void copy_label_outputs_from_index_parts(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
-                                           opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-                                           raft::handle_t& handle,
-                                           size_t batch_size)
-  {
-    const int TPB_X = 256;
-    int n_labels    = batch_size * params.k;
-    dim3 grid(raft::ceildiv(n_labels, TPB_X));
-    dim3 blk(TPB_X);
-
-    uint64_t offset = 0;
-    std::vector<uint64_t> offsets_h;
-    for (auto& rsp : work.idxPartsToRanks) {
-      if (rsp->rank == work.my_rank) { offsets_h.push_back(offset); }
-      offset += rsp->size;
-    }
-    std::size_t n_parts = offsets_h.size();
-    rmm::device_uvector<uint64_t> offsets_d(n_parts, handle.get_stream());
-    raft::update_device(offsets_d.data(), offsets_h.data(), n_parts, handle.get_stream());
-
-    std::vector<out_t*> parts_h(n_parts);
-    rmm::device_uvector<out_t*> parts_d(n_parts, handle.get_stream());
-    for (std::size_t o = 0; o < params.n_outputs; o++) {
-      for (std::size_t p = 0; p < n_parts; p++) {
-        parts_h[p] = params.y->at(p)[o];
-      }
-      raft::update_device(parts_d.data(), parts_h.data(), n_parts, handle.get_stream());
+  for (std::size_t cur_idx = 0; cur_idx < params.idx_data->size(); cur_idx++) {
+    ptrs[cur_idx]  = params.idx_data->at(cur_idx)->ptr;
+    sizes[cur_idx] = work.local_idx_parts[cur_idx]->size;
+  }
 
-      copy_label_outputs_from_index_parts_kernel<TPB_X, ind_t, out_t>
-        <<<grid, blk, 0, handle.get_stream()>>>(work.res.data() + (o * n_labels),
-                                                work.res_I.data(),
-                                                parts_d.data(),
-                                                offsets_d.data(),
-                                                batch_size,
-                                                n_parts,
-                                                n_labels);
+  // Offset nearest neighbor index matrix by partition indices
+  std::vector<size_t> start_indices = params.idx_desc->startIndices(work.my_rank);
+  // PartDescriptor uses size_t while FAISS uses int64_t
+  // so we need to do a quick conversion.
+  std::vector<int64_t> start_indices_long;
+  for (size_t start_index : start_indices)
+    start_indices_long.push_back((int64_t)start_index);
+
+  // ID ranges need to be offset by each local partition's
+  // starting indices.
+  raft::spatial::knn::brute_force_knn<std::int64_t, float, std::size_t>(
+    handle,
+    ptrs,
+    sizes,
+    params.idx_desc->N,
+    query,
+    query_size,
+    work.res_I.data(),
+    work.res_D.data(),
+    params.k,
+    params.rowMajorIndex,
+    params.rowMajorQuery,
+    &start_indices_long,
+    raft::distance::DistanceType::L2SqrtExpanded);
+  handle.sync_stream(handle.get_stream());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/**
+ * Copies, one thread per element, the label of each locally merged neighbor:
+ * out[i] = parts[p][knn_indices[i] - offsets[p]], p being the part selected by the scan below
+ * @param[out] out merged labels (n_labels elements are written)
+ * @param[in] knn_indices merged neighbor indices (n_labels elements are read)
+ * @param[in] parts unmerged labels, one array pointer per partition
+ * @param[in] offsets start index of each partition, used to identify the origin
+ * partition of a nearest-neighbor index; the scan presumably expects ascending values
+ * @param[in] cur_batch_size current batch size (not read by the kernel body)
+ * @param[in] n_parts number of partitions
+ * @param[in] n_labels number of labels to write (the host wrapper passes batch_size * k)
+ */
+template <int TPB_X, typename ind_t, typename out_t>
+__global__ void copy_label_outputs_from_index_parts_kernel(out_t* out,
+                                                           ind_t* knn_indices,
+                                                           out_t** parts,
+                                                           uint64_t* offsets,
+                                                           size_t cur_batch_size,
+                                                           int n_parts,
+                                                           int n_labels)
+{
+  uint64_t i = (blockIdx.x * TPB_X) + threadIdx.x;  // indexing requires blockDim.x == TPB_X
+  if (i >= n_labels) return;
+  uint64_t nn_idx = knn_indices[i];
+  int part_idx    = 0;
+  for (; part_idx < n_parts && nn_idx >= offsets[part_idx]; part_idx++) {}  // first part past idx
+  part_idx        = std::min(std::max(0, part_idx - 1), n_parts - 1);  // step back and clamp
+  uint64_t offset = nn_idx - offsets[part_idx];
+  out[i]          = parts[part_idx][offset];
+}
+
+/*!
+ Get the right labels for indices obtained after a KNN merge
+ @param[in] params Parameters for distributed KNN operation
+ @param[in] work Current work for distributed KNN
+ @param[in] handle RAFT handle
+ @param[in] batch_size Batch size
+ */
+template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+void copy_label_outputs_from_index_parts(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
+                                         opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+                                         raft::handle_t& handle,
+                                         size_t batch_size)
+{
+  const int TPB_X = 256;
+  int n_labels    = batch_size * params.k;
+  dim3 grid(raft::ceildiv(n_labels, TPB_X));
+  dim3 blk(TPB_X);
+
+  uint64_t offset = 0;
+  std::vector<uint64_t> offsets_h;
+  for (auto& rsp : work.idxPartsToRanks) {
+    if (rsp->rank == work.my_rank) { offsets_h.push_back(offset); }
+    offset += rsp->size;
+  }
+  std::size_t n_parts = offsets_h.size();
+  rmm::device_uvector<uint64_t> offsets_d(n_parts, handle.get_stream());
+  raft::update_device(offsets_d.data(), offsets_h.data(), n_parts, handle.get_stream());
+
+  std::vector<out_t*> parts_h(n_parts);
+  rmm::device_uvector<out_t*> parts_d(n_parts, handle.get_stream());
+  for (std::size_t o = 0; o < params.n_outputs; o++) {
+    for (std::size_t p = 0; p < n_parts; p++) {
+      parts_h[p] = params.y->at(p)[o];
     }
-    handle.sync_stream(handle.get_stream());
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
+    raft::update_device(parts_d.data(), parts_h.data(), n_parts, handle.get_stream());
+
+    copy_label_outputs_from_index_parts_kernel<TPB_X, ind_t, out_t>
+      <<<grid, blk, 0, handle.get_stream()>>>(work.res.data() + (o * n_labels),
+                                              work.res_I.data(),
+                                              parts_d.data(),
+                                              offsets_d.data(),
+                                              batch_size,
+                                              n_parts,
+                                              n_labels);
   }
+  handle.sync_stream(handle.get_stream());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/*!
+ Exchange results of local KNN search and operation for a given query batch
+ All non-root index ranks send the results for the current
+ query batch to the root rank for the batch.
+ @param[in] params Parameters for distributed KNN operation
+ @param[in] work Current work for distributed KNN
+ @param[in] handle RAFT handle
+ @param[in] part_rank Rank of currently processed query batch
+ @param[in] batch_size Batch size
+ */
+template <typename in_t, typename ind_t, typename dist_t, typename out_t>
+void exchange_results(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
+                      opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+                      raft::handle_t& handle,
+                      int part_rank,
+                      size_t batch_size)
+{
+  size_t batch_elms = batch_size * params.k;
 
-  /*!
-   Exchange results of local KNN search and operation for a given query batch
-   All non-root index ranks send the results for the current
-   query batch to the root rank for the batch.
-   @param[in] params Parameters for distrbuted KNN operation
-   @param[in] work Current work for distributed KNN
-   @param[in] handle RAFT handle
-   @param[in] part_rank Rank of currently processed query batch
-   @param[in] batch_size Batch size
-   */
-  template <typename in_t, typename ind_t, typename dist_t, typename out_t>
-  void exchange_results(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
-                        opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-                        raft::handle_t& handle,
-                        int part_rank,
-                        size_t batch_size)
-  {
-    size_t batch_elms = batch_size * params.k;
-
-    int request_idx = 0;
-    std::vector<raft::comms::request_t> requests;
-    if (part_rank != work.my_rank) {  // Either send local KNN results
-      requests.resize(2);
-      handle.get_comms().isend(
-        work.res_I.data(), batch_elms, part_rank, 0, requests.data() + request_idx);
-      ++request_idx;
-
-      handle.get_comms().isend(
-        work.res_D.data(), batch_elms, part_rank, 0, requests.data() + request_idx);
-      ++request_idx;
-
-      if (params.knn_op != knn_operation::knn) {
-        requests.resize(2 + params.n_outputs);
-        for (std::size_t o = 0; o < params.n_outputs; o++) {
-          handle.get_comms().isend(work.res.data() + (o * batch_elms),
-                                   batch_elms,
-                                   part_rank,
-                                   0,
-                                   requests.data() + request_idx);
-          ++request_idx;
-        }
-      }
-    } else {  // Or, as the owner of currently processed query batch,
-              // receive results from other workers for reduce
-      bool part_rank_is_idx = work.idxRanks.find(part_rank) != work.idxRanks.end();
-      size_t idx_rank_size  = work.idxRanks.size();
-
-      // if root rank is an index, it will already have
-      // query data, so no need to receive from it.
-      work.res_I.resize(batch_elms * idx_rank_size, handle.get_stream());
-      work.res_D.resize(batch_elms * idx_rank_size, handle.get_stream());
-
-      if (params.knn_op != knn_operation::knn) {
-        work.res.resize(batch_elms * params.n_outputs * idx_rank_size, handle.get_stream());
-      }
+  int request_idx = 0;
+  std::vector<raft::comms::request_t> requests;
+  if (part_rank != work.my_rank) {  // Either send local KNN results
+    requests.resize(2);
+    handle.get_comms().isend(
+      work.res_I.data(), batch_elms, part_rank, 0, requests.data() + request_idx);
+    ++request_idx;
 
-      if (part_rank_is_idx) {
-        /**
-         * If this worker (in charge of reduce),
-         * has some local results as well,
-         * copy them at right location
-         */
-        --idx_rank_size;
-        int i = 0;
-        for (int rank : work.idxRanks) {
-          if (rank == work.my_rank) {
-            size_t batch_offset = batch_elms * i;
-
-            // Indices and distances are stored in rank order
-            raft::copy_async(
-              work.res_I.data() + batch_offset, work.res_I.data(), batch_elms, handle.get_stream());
-            raft::copy_async(
-              work.res_D.data() + batch_offset, work.res_D.data(), batch_elms, handle.get_stream());
-
-            if (params.knn_op != knn_operation::knn) {
-              rmm::device_uvector<out_t> tmp_res(params.n_outputs * batch_elms,
-                                                 handle.get_stream());
-              raft::copy_async(
-                tmp_res.data(), work.res.data(), tmp_res.size(), handle.get_stream());
-
-              for (std::size_t o = 0; o < params.n_outputs; ++o) {
-                // Outputs are stored in target order and then in rank order
-                raft::copy_async(
-                  work.res.data() + (o * work.idxRanks.size() * batch_elms) + batch_offset,
-                  tmp_res.data() + (o * batch_elms),
-                  batch_elms,
-                  handle.get_stream());
-              }
-            }
-            handle.sync_stream(handle.get_stream());
-            break;
-          }
-          i++;
-        }
+    handle.get_comms().isend(
+      work.res_D.data(), batch_elms, part_rank, 0, requests.data() + request_idx);
+    ++request_idx;
+
+    if (params.knn_op != knn_operation::knn) {
+      requests.resize(2 + params.n_outputs);
+      for (std::size_t o = 0; o < params.n_outputs; o++) {
+        handle.get_comms().isend(work.res.data() + (o * batch_elms),
+                                 batch_elms,
+                                 part_rank,
+                                 0,
+                                 requests.data() + request_idx);
+        ++request_idx;
       }
+    }
+  } else {  // Or, as the owner of currently processed query batch,
+    // receive results from other workers for reduce
+    bool part_rank_is_idx = work.idxRanks.find(part_rank) != work.idxRanks.end();
+    size_t idx_rank_size  = work.idxRanks.size();
+
+    // if root rank is an index, it will already have
+    // query data, so no need to receive from it.
+    work.res_I.resize(batch_elms * idx_rank_size, handle.get_stream());
+    work.res_D.resize(batch_elms * idx_rank_size, handle.get_stream());
 
-      size_t request_size = 2 * idx_rank_size;
-      if (params.knn_op != knn_operation::knn)
-        request_size = (2 + params.n_outputs) * idx_rank_size;
-      requests.resize(request_size);
+    if (params.knn_op != knn_operation::knn) {
+      work.res.resize(batch_elms * params.n_outputs * idx_rank_size, handle.get_stream());
+    }
 
-      int num_received = 0;
+    if (part_rank_is_idx) {
+      /**
+       * If this worker (in charge of reduce),
+       * has some local results as well,
+       * copy them at right location
+       */
+      --idx_rank_size;
+      int i = 0;
       for (int rank : work.idxRanks) {
-        if (rank != work.my_rank) {
-          size_t batch_offset = batch_elms * num_received;
+        if (rank == work.my_rank) {
+          size_t batch_offset = batch_elms * i;
 
           // Indices and distances are stored in rank order
-          handle.get_comms().irecv(
-            work.res_I.data() + batch_offset, batch_elms, rank, 0, requests.data() + request_idx);
-          ++request_idx;
-          handle.get_comms().irecv(
-            work.res_D.data() + batch_offset, batch_elms, rank, 0, requests.data() + request_idx);
-          ++request_idx;
+          raft::copy_async(
+            work.res_I.data() + batch_offset, work.res_I.data(), batch_elms, handle.get_stream());
+          raft::copy_async(
+            work.res_D.data() + batch_offset, work.res_D.data(), batch_elms, handle.get_stream());
 
           if (params.knn_op != knn_operation::knn) {
-            for (std::size_t o = 0; o < params.n_outputs; o++) {
+            rmm::device_uvector<out_t> tmp_res(params.n_outputs * batch_elms, handle.get_stream());
+            raft::copy_async(tmp_res.data(), work.res.data(), tmp_res.size(), handle.get_stream());
+
+            for (std::size_t o = 0; o < params.n_outputs; ++o) {
               // Outputs are stored in target order and then in rank order
-              out_t* r = work.res.data() + (o * work.idxRanks.size() * batch_elms) + batch_offset;
-              handle.get_comms().irecv(r, batch_elms, rank, 0, requests.data() + request_idx);
-              ++request_idx;
+              raft::copy_async(
+                work.res.data() + (o * work.idxRanks.size() * batch_elms) + batch_offset,
+                tmp_res.data() + (o * batch_elms),
+                batch_elms,
+                handle.get_stream());
             }
           }
+          handle.sync_stream(handle.get_stream());
+          break;
         }
-        if (rank != work.my_rank || part_rank_is_idx) {
-          /**
-           * Increase index for each new reception
-           * Also increase index when the worker doing a reduce operation
-           * has some index data (previously copied at right location).
-           */
-          ++num_received;
-        }
+        i++;
       }
     }
 
-    try {
-      handle.get_comms().waitall(requests.size(), requests.data());
-    } catch (raft::exception& e) {
-      CUML_LOG_DEBUG("FAILURE!");
+    size_t request_size = 2 * idx_rank_size;
+    if (params.knn_op != knn_operation::knn) request_size = (2 + params.n_outputs) * idx_rank_size;
+    requests.resize(request_size);
+
+    int num_received = 0;
+    for (int rank : work.idxRanks) {
+      if (rank != work.my_rank) {
+        size_t batch_offset = batch_elms * num_received;
+
+        // Indices and distances are stored in rank order
+        handle.get_comms().irecv(
+          work.res_I.data() + batch_offset, batch_elms, rank, 0, requests.data() + request_idx);
+        ++request_idx;
+        handle.get_comms().irecv(
+          work.res_D.data() + batch_offset, batch_elms, rank, 0, requests.data() + request_idx);
+        ++request_idx;
+
+        if (params.knn_op != knn_operation::knn) {
+          for (std::size_t o = 0; o < params.n_outputs; o++) {
+            // Outputs are stored in target order and then in rank order
+            out_t* r = work.res.data() + (o * work.idxRanks.size() * batch_elms) + batch_offset;
+            handle.get_comms().irecv(r, batch_elms, rank, 0, requests.data() + request_idx);
+            ++request_idx;
+          }
+        }
+      }
+      if (rank != work.my_rank || part_rank_is_idx) {
+        /**
+         * Increase index for each new reception
+         * Also increase index when the worker doing a reduce operation
+         * has some index data (previously copied at right location).
+         */
+        ++num_received;
+      }
     }
   }
 
-  /*!
-   Reduce all local results to a global result for a given query batch
-   @param[in] params Parameters for distrbuted KNN operation
-   @param[in] work Current work for distributed KNN
-   @param[in] handle RAFT handle
-   @param[in] part_idx Partition index of query batch
-   @param[in] processed_in_part Number of queries already processed in part (serves as offset)
-   @param[in] batch_size Batch size
-   */
-  template <typename in_t,
-            typename ind_t,
-            typename dist_t,
-            typename out_t,
-            typename trans_t = int64_t>
-  void reduce(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
-              opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-              raft::handle_t& handle,
-              int part_idx,
-              size_t processed_in_part,
-              size_t batch_size)
-  {
-    rmm::device_uvector<trans_t> trans(work.idxRanks.size(), handle.get_stream());
-    RAFT_CUDA_TRY(cudaMemsetAsync(
-      trans.data(), 0, work.idxRanks.size() * sizeof(trans_t), handle.get_stream()));
-
-    size_t batch_offset = processed_in_part * params.k;
-
-    ind_t* indices    = nullptr;
-    dist_t* distances = nullptr;
-
-    rmm::device_uvector<ind_t> indices_b(0, handle.get_stream());
-    rmm::device_uvector<dist_t> distances_b(0, handle.get_stream());
+  try {
+    handle.get_comms().waitall(requests.size(), requests.data());
+  } catch (raft::exception& e) {
+    CUML_LOG_DEBUG("FAILURE!");
+  }
+}
+
+/*!
+ Reduce all local results to a global result for a given query batch
+ @param[in] params Parameters for distributed KNN operation
+ @param[in] work Current work for distributed KNN
+ @param[in] handle RAFT handle
+ @param[in] part_idx Partition index of query batch
+ @param[in] processed_in_part Number of queries already processed in part (serves as offset)
+ @param[in] batch_size Batch size
+ */
+template <typename in_t,
+          typename ind_t,
+          typename dist_t,
+          typename out_t,
+          typename trans_t = int64_t>
+void reduce(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
+            opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+            raft::handle_t& handle,
+            int part_idx,
+            size_t processed_in_part,
+            size_t batch_size)
+{
+  rmm::device_uvector<trans_t> trans(work.idxRanks.size(), handle.get_stream());
+  RAFT_CUDA_TRY(
+    cudaMemsetAsync(trans.data(), 0, work.idxRanks.size() * sizeof(trans_t), handle.get_stream()));
+
+  size_t batch_offset = processed_in_part * params.k;
+
+  ind_t* indices    = nullptr;
+  dist_t* distances = nullptr;
+
+  rmm::device_uvector<ind_t> indices_b(0, handle.get_stream());
+  rmm::device_uvector<dist_t> distances_b(0, handle.get_stream());
+
+  if (params.knn_op == knn_operation::knn) {
+    indices   = params.out_I->at(part_idx)->ptr + batch_offset;
+    distances = params.out_D->at(part_idx)->ptr + batch_offset;
+  } else {
+    indices_b.resize(batch_size * params.k, handle.get_stream());
+    distances_b.resize(batch_size * params.k, handle.get_stream());
+    indices   = indices_b.data();
+    distances = distances_b.data();
+  }
 
-    if (params.knn_op == knn_operation::knn) {
-      indices   = params.out_I->at(part_idx)->ptr + batch_offset;
-      distances = params.out_D->at(part_idx)->ptr + batch_offset;
+  // Merge all KNN local results
+  raft::spatial::knn::knn_merge_parts(work.res_D.data(),
+                                      work.res_I.data(),
+                                      distances,
+                                      indices,
+                                      batch_size,
+                                      work.idxRanks.size(),
+                                      params.k,
+                                      handle.get_stream(),
+                                      trans.data());
+  handle.sync_stream(handle.get_stream());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+  if (params.knn_op != knn_operation::knn) {
+    rmm::device_uvector<out_t> merged_outputs_b(params.n_outputs * batch_size * params.k,
+                                                handle.get_stream());
+    // Get the right labels for indices obtained after local KNN searches
+    merge_labels(params,
+                 work,
+                 handle,
+                 merged_outputs_b.data(),
+                 indices,
+                 work.res.data(),
+                 work.res_I.data(),
+                 batch_size);
+
+    out_t* outputs = nullptr;
+    std::vector<float*> probas_with_offsets;
+
+    if (params.knn_op != knn_operation::class_proba) {
+      outputs = params.out->at(part_idx)->ptr + (processed_in_part * params.n_outputs);
     } else {
-      indices_b.resize(batch_size * params.k, handle.get_stream());
-      distances_b.resize(batch_size * params.k, handle.get_stream());
-      indices   = indices_b.data();
-      distances = distances_b.data();
+      std::vector<float*>& probas_part = params.probas->at(part_idx);
+      for (std::size_t i = 0; i < params.n_outputs; i++) {
+        float* ptr           = probas_part[i];
+        int n_unique_classes = params.n_unique->at(i);
+        probas_with_offsets.push_back(ptr + (processed_in_part * n_unique_classes));
+      }
     }
 
-    // Merge all KNN local results
-    raft::spatial::knn::knn_merge_parts(work.res_D.data(),
-                                        work.res_I.data(),
-                                        distances,
-                                        indices,
-                                        batch_size,
-                                        work.idxRanks.size(),
-                                        params.k,
-                                        handle.get_stream(),
-                                        trans.data());
+    // Perform final classification, regression or class-proba operation
+    perform_local_operation(
+      params, work, handle, outputs, probas_with_offsets, merged_outputs_b.data(), batch_size);
+
     handle.sync_stream(handle.get_stream());
     RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-    if (params.knn_op != knn_operation::knn) {
-      rmm::device_uvector<out_t> merged_outputs_b(params.n_outputs * batch_size * params.k,
-                                                  handle.get_stream());
-      // Get the right labels for indices obtained after local KNN searches
-      merge_labels(params,
-                   work,
-                   handle,
-                   merged_outputs_b.data(),
-                   indices,
-                   work.res.data(),
-                   work.res_I.data(),
-                   batch_size);
-
-      out_t* outputs = nullptr;
-      std::vector<float*> probas_with_offsets;
-
-      if (params.knn_op != knn_operation::class_proba) {
-        outputs = params.out->at(part_idx)->ptr + (processed_in_part * params.n_outputs);
-      } else {
-        std::vector<float*>& probas_part = params.probas->at(part_idx);
-        for (std::size_t i = 0; i < params.n_outputs; i++) {
-          float* ptr           = probas_part[i];
-          int n_unique_classes = params.n_unique->at(i);
-          probas_with_offsets.push_back(ptr + (processed_in_part * n_unique_classes));
-        }
-      }
-
-      // Perform final classification, regression or class-proba operation
-      perform_local_operation(
-        params, work, handle, outputs, probas_with_offsets, merged_outputs_b.data(), batch_size);
-
-      handle.sync_stream(handle.get_stream());
-      RAFT_CUDA_TRY(cudaPeekAtLastError());
-    }
   }
-
-  /**
-   * This function copies the labels associated to the merged indices
-   * from the unmerged to a merged (n_ranks times smaller) array of labels
-   * @param[out] outputs merged labels
-   * @param[in] knn_indices merged indices
-   * @param[in] unmerged_outputs unmerged labels
-   * @param[in] unmerged_knn_indices unmerged indices
-   * @param[in] offsets array splitting the partitions making it possible
-   * to identify the origin partition of an nearest neighbor index
-   * @param[in] parts_to_ranks get rank index from index partition index,
-   * informative to find positions as the unmerged arrays are built
-   * so that ranks are in order (unlike partitions)
-   * @param[in] nearest_neighbors number of nearest neighbors to look for in query
-   * @param[in] n_outputs number of targets
-   * @param[in] n_labels number of labels to write (batch_size * n_outputs)
-   * @param[in] n_parts number of index partitions
-   * @param[in] n_ranks number of index ranks
-   */
-  template <int TPB_X, typename dist_t, typename out_t>
-  __global__ void merge_labels_kernel(out_t* outputs,
-                                      dist_t* knn_indices,
-                                      out_t* unmerged_outputs,
-                                      dist_t* unmerged_knn_indices,
-                                      size_t* offsets,
-                                      int* parts_to_ranks,
-                                      int nearest_neighbors,
-                                      int n_outputs,
-                                      int n_labels,
-                                      int n_parts,
-                                      int n_ranks)
-  {
-    uint64_t i = (blockIdx.x * TPB_X) + threadIdx.x;
-    if (i >= n_labels) return;
-    uint64_t nn_idx = knn_indices[i];
-    int part_idx    = 0;
-    for (; part_idx < n_parts && nn_idx >= offsets[part_idx]; part_idx++) {}
-    part_idx         = std::min(std::max(0, part_idx - 1), n_parts - 1);
-    int rank_idx     = parts_to_ranks[part_idx];
-    int inbatch_idx  = i / nearest_neighbors;
-    uint64_t elm_idx = (rank_idx * n_labels) + inbatch_idx * nearest_neighbors;
-    for (int k = 0; k < nearest_neighbors; k++) {
-      if (nn_idx == unmerged_knn_indices[elm_idx + k]) {
-        for (int o = 0; o < n_outputs; o++) {
-          outputs[(o * n_labels) + i] = unmerged_outputs[(o * n_ranks * n_labels) + elm_idx + k];
-        }
-        return;
+}
+
+/**
+ * This function copies the labels associated to the merged indices
+ * from the unmerged to a merged (n_ranks times smaller) array of labels
+ * @param[out] outputs merged labels
+ * @param[in] knn_indices merged indices
+ * @param[in] unmerged_outputs unmerged labels
+ * @param[in] unmerged_knn_indices unmerged indices
+ * @param[in] offsets array splitting the partitions making it possible
+ * to identify the origin partition of a nearest neighbor index
+ * @param[in] parts_to_ranks get rank index from index partition index,
+ * informative to find positions as the unmerged arrays are built
+ * so that ranks are in order (unlike partitions)
+ * @param[in] nearest_neighbors number of nearest neighbors to look for in query
+ * @param[in] n_outputs number of targets
+ * @param[in] n_labels number of labels to write per target (batch_size * nearest_neighbors)
+ * @param[in] n_parts number of index partitions
+ * @param[in] n_ranks number of index ranks
+ */
+template <int TPB_X, typename dist_t, typename out_t>
+__global__ void merge_labels_kernel(out_t* outputs,
+                                    dist_t* knn_indices,
+                                    out_t* unmerged_outputs,
+                                    dist_t* unmerged_knn_indices,
+                                    size_t* offsets,
+                                    int* parts_to_ranks,
+                                    int nearest_neighbors,
+                                    int n_outputs,
+                                    int n_labels,
+                                    int n_parts,
+                                    int n_ranks)
+{
+  uint64_t i = (blockIdx.x * TPB_X) + threadIdx.x;
+  if (i >= n_labels) return;
+  uint64_t nn_idx = knn_indices[i];
+  int part_idx    = 0;
+  for (; part_idx < n_parts && nn_idx >= offsets[part_idx]; part_idx++) {}
+  part_idx         = std::min(std::max(0, part_idx - 1), n_parts - 1);
+  int rank_idx     = parts_to_ranks[part_idx];
+  int inbatch_idx  = i / nearest_neighbors;
+  uint64_t elm_idx = (rank_idx * n_labels) + inbatch_idx * nearest_neighbors;
+  for (int k = 0; k < nearest_neighbors; k++) {
+    if (nn_idx == unmerged_knn_indices[elm_idx + k]) {
+      for (int o = 0; o < n_outputs; o++) {
+        outputs[(o * n_labels) + i] = unmerged_outputs[(o * n_ranks * n_labels) + elm_idx + k];
       }
+      return;
     }
   }
-
-  /*!
-   Get the right labels for indices obtained after local KNN searches
-   @param[in] params Parameters for distrbuted KNN operation
-   @param[in] work Current work for distributed KNN
-   @param[in] handle RAFT handle
-   @param[out] output KNN outputs output array
-   @param[out] knn_indices KNN class-probas output array (class-proba only)
-   @param[in] unmerged_outputs KNN labels input array
-   @param[in] unmerged_knn_indices Batch size
-   @param[in] batch_size Batch size
-   */
-  template <typename opg_knn_param_t, typename opg_knn_work_t, typename ind_t, typename out_t>
-  void merge_labels(opg_knn_param_t& params,
-                    opg_knn_work_t& work,
-                    raft::handle_t& handle,
-                    out_t* output,
-                    ind_t* knn_indices,
-                    out_t* unmerged_outputs,
-                    ind_t* unmerged_knn_indices,
-                    int batch_size)
-  {
-    const int TPB_X = 256;
-    int n_labels    = batch_size * params.k;
-    dim3 grid(raft::ceildiv(n_labels, TPB_X));
-    dim3 blk(TPB_X);
-
-    int offset = 0;
-    std::vector<uint64_t> offsets_h;
-    for (auto& rsp : work.idxPartsToRanks) {
-      offsets_h.push_back(offset);
-      offset += rsp->size;
-    }
-    rmm::device_uvector<uint64_t> offsets_d(offsets_h.size(), handle.get_stream());
-    raft::update_device(offsets_d.data(), offsets_h.data(), offsets_h.size(), handle.get_stream());
-
-    std::vector<int> parts_to_ranks_h;
-    for (auto& rsp : work.idxPartsToRanks) {
-      int i = 0;
-      for (int rank : work.idxRanks) {
-        if (rank == rsp->rank) { parts_to_ranks_h.push_back(i); }
-        ++i;
-      }
-    }
-    rmm::device_uvector<int> parts_to_ranks_d(parts_to_ranks_h.size(), handle.get_stream());
-    raft::update_device(parts_to_ranks_d.data(),
-                        parts_to_ranks_h.data(),
-                        parts_to_ranks_h.size(),
-                        handle.get_stream());
-
-    merge_labels_kernel<TPB_X><<<grid, blk, 0, handle.get_stream()>>>(output,
-                                                                      knn_indices,
-                                                                      unmerged_outputs,
-                                                                      unmerged_knn_indices,
-                                                                      offsets_d.data(),
-                                                                      parts_to_ranks_d.data(),
-                                                                      params.k,
-                                                                      params.n_outputs,
-                                                                      n_labels,
-                                                                      work.idxPartsToRanks.size(),
-                                                                      work.idxRanks.size());
+}
+
+/*!
+ Get the right labels for indices obtained after local KNN searches
+ @param[in] params Parameters for distributed KNN operation
+ @param[in] work Current work for distributed KNN
+ @param[in] handle RAFT handle
+ @param[out] output KNN outputs output array
+ @param[in] knn_indices Merged KNN indices input array
+ @param[in] unmerged_outputs KNN labels input array
+ @param[in] unmerged_knn_indices Unmerged KNN indices input array
+ @param[in] batch_size Batch size
+ */
+template <typename opg_knn_param_t, typename opg_knn_work_t, typename ind_t, typename out_t>
+void merge_labels(opg_knn_param_t& params,
+                  opg_knn_work_t& work,
+                  raft::handle_t& handle,
+                  out_t* output,
+                  ind_t* knn_indices,
+                  out_t* unmerged_outputs,
+                  ind_t* unmerged_knn_indices,
+                  int batch_size)
+{
+  const int TPB_X = 256;
+  int n_labels    = batch_size * params.k;
+  dim3 grid(raft::ceildiv(n_labels, TPB_X));
+  dim3 blk(TPB_X);
+
+  int offset = 0;
+  std::vector<uint64_t> offsets_h;
+  for (auto& rsp : work.idxPartsToRanks) {
+    offsets_h.push_back(offset);
+    offset += rsp->size;
   }
-
-  /*!
-   Perform final classification, regression or class-proba operation for a given query batch
-   @param[in] params Parameters for distrbuted KNN operation
-   @param[in] work Current work for distributed KNN
-   @param[in] handle RAFT handle
-   @param[out] outputs KNN outputs output array
-   @param[out] probas_with_offsets KNN class-probas output array (class-proba only)
-   @param[in] labels KNN labels input array
-   @param[in] batch_size Batch size
-   */
-  template <typename in_t,
-            typename ind_t,
-            typename dist_t,
-            typename out_t,
-            typename std::enable_if<std::is_floating_point<out_t>::value>::type* = nullptr>
-  void perform_local_operation(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
-                               opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-                               raft::handle_t& handle,
-                               out_t* outputs,
-                               std::vector<float*>& probas_with_offsets,
-                               out_t* labels,
-                               size_t batch_size)
-  {
-    size_t n_labels = batch_size * params.k;
-    std::vector<out_t*> y(params.n_outputs);
-    for (std::size_t o = 0; o < params.n_outputs; o++) {
-      y[o] = reinterpret_cast<out_t*>(labels) + (o * n_labels);
+  rmm::device_uvector<uint64_t> offsets_d(offsets_h.size(), handle.get_stream());
+  raft::update_device(offsets_d.data(), offsets_h.data(), offsets_h.size(), handle.get_stream());
+
+  std::vector<int> parts_to_ranks_h;
+  for (auto& rsp : work.idxPartsToRanks) {
+    int i = 0;
+    for (int rank : work.idxRanks) {
+      if (rank == rsp->rank) { parts_to_ranks_h.push_back(i); }
+      ++i;
     }
-
-    MLCommon::Selection::knn_regress<float, 32, true>(
-      handle, outputs, nullptr, y, n_labels, batch_size, params.k);
+  }
+  rmm::device_uvector<int> parts_to_ranks_d(parts_to_ranks_h.size(), handle.get_stream());
+  raft::update_device(
+    parts_to_ranks_d.data(), parts_to_ranks_h.data(), parts_to_ranks_h.size(), handle.get_stream());
+
+  merge_labels_kernel<TPB_X><<<grid, blk, 0, handle.get_stream()>>>(output,
+                                                                    knn_indices,
+                                                                    unmerged_outputs,
+                                                                    unmerged_knn_indices,
+                                                                    offsets_d.data(),
+                                                                    parts_to_ranks_d.data(),
+                                                                    params.k,
+                                                                    params.n_outputs,
+                                                                    n_labels,
+                                                                    work.idxPartsToRanks.size(),
+                                                                    work.idxRanks.size());
+}
+
+/*!
+ Perform final classification, regression or class-proba operation for a given query batch
+ @param[in] params Parameters for distributed KNN operation
+ @param[in] work Current work for distributed KNN
+ @param[in] handle RAFT handle
+ @param[out] outputs KNN outputs output array
+ @param[out] probas_with_offsets KNN class-probas output array (class-proba only)
+ @param[in] labels KNN labels input array
+ @param[in] batch_size Batch size
+ */
+template <typename in_t,
+          typename ind_t,
+          typename dist_t,
+          typename out_t,
+          typename std::enable_if<std::is_floating_point<out_t>::value>::type* = nullptr>
+void perform_local_operation(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
+                             opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+                             raft::handle_t& handle,
+                             out_t* outputs,
+                             std::vector<float*>& probas_with_offsets,
+                             out_t* labels,
+                             size_t batch_size)
+{
+  size_t n_labels = batch_size * params.k;
+  std::vector<out_t*> y(params.n_outputs);
+  for (std::size_t o = 0; o < params.n_outputs; o++) {
+    y[o] = reinterpret_cast<out_t*>(labels) + (o * n_labels);
   }
 
-  /*!
-   Perform final classification, regression or class-proba operation for a given query batch
-   @param[in] params Parameters for distrbuted KNN operation
-   @param[in] work Current work for distributed KNN
-   @param[in] handle RAFT handle
-   @param[out] outputs KNN outputs output array
-   @param[out] probas_with_offsets KNN class-probas output array (class-proba only)
-   @param[in] labels KNN labels input array
-   @param[in] batch_size Batch size
-   */
-  template <typename in_t,
-            typename ind_t,
-            typename dist_t,
-            typename out_t,
-            typename std::enable_if<std::is_integral<out_t>::value>::type* = nullptr>
-  void perform_local_operation(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
-                               opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
-                               raft::handle_t& handle,
-                               out_t* outputs,
-                               std::vector<float*>& probas_with_offsets,
-                               out_t* labels,
-                               size_t batch_size)
-  {
-    size_t n_labels = batch_size * params.k;
-    std::vector<out_t*> y(params.n_outputs);
-    for (std::size_t o = 0; o < params.n_outputs; o++) {
-      y[o] = reinterpret_cast<out_t*>(labels) + (o * n_labels);
-    }
+  MLCommon::Selection::knn_regress<float, 32, true>(
+    handle, outputs, nullptr, y, n_labels, batch_size, params.k);
+}
+
+/*!
+ Perform final classification, regression or class-proba operation for a given query batch
+ @param[in] params Parameters for distributed KNN operation
+ @param[in] work Current work for distributed KNN
+ @param[in] handle RAFT handle
+ @param[out] outputs KNN outputs output array
+ @param[out] probas_with_offsets KNN class-probas output array (class-proba only)
+ @param[in] labels KNN labels input array
+ @param[in] batch_size Batch size
+ */
+template <typename in_t,
+          typename ind_t,
+          typename dist_t,
+          typename out_t,
+          typename std::enable_if<std::is_integral<out_t>::value>::type* = nullptr>
+void perform_local_operation(opg_knn_param<in_t, ind_t, dist_t, out_t>& params,
+                             opg_knn_work<in_t, ind_t, dist_t, out_t>& work,
+                             raft::handle_t& handle,
+                             out_t* outputs,
+                             std::vector<float*>& probas_with_offsets,
+                             out_t* labels,
+                             size_t batch_size)
+{
+  size_t n_labels = batch_size * params.k;
+  std::vector<out_t*> y(params.n_outputs);
+  for (std::size_t o = 0; o < params.n_outputs; o++) {
+    y[o] = reinterpret_cast<out_t*>(labels) + (o * n_labels);
+  }
 
-    switch (params.knn_op) {
-      case knn_operation::classification:
-        MLCommon::Selection::knn_classify<32, true>(handle,
-                                                    outputs,
-                                                    nullptr,
-                                                    y,
-                                                    n_labels,
-                                                    batch_size,
-                                                    params.k,
-                                                    *(params.uniq_labels),
-                                                    *(params.n_unique));
-        break;
-      case knn_operation::class_proba:
-        MLCommon::Selection::class_probs<32, true>(handle,
-                                                   probas_with_offsets,
-                                                   nullptr,
-                                                   y,
-                                                   n_labels,
-                                                   batch_size,
-                                                   params.k,
-                                                   *(params.uniq_labels),
-                                                   *(params.n_unique));
-        break;
-      default: CUML_LOG_DEBUG("FAILURE!");
-    }
+  switch (params.knn_op) {
+    case knn_operation::classification:
+      MLCommon::Selection::knn_classify<32, true>(handle,
+                                                  outputs,
+                                                  nullptr,
+                                                  y,
+                                                  n_labels,
+                                                  batch_size,
+                                                  params.k,
+                                                  *(params.uniq_labels),
+                                                  *(params.n_unique));
+      break;
+    case knn_operation::class_proba:
+      MLCommon::Selection::class_probs<32, true>(handle,
+                                                 probas_with_offsets,
+                                                 nullptr,
+                                                 y,
+                                                 n_labels,
+                                                 batch_size,
+                                                 params.k,
+                                                 *(params.uniq_labels),
+                                                 *(params.n_unique));
+      break;
+    default: CUML_LOG_DEBUG("FAILURE!");
   }
+}
 
-  };  // namespace knn_common
-  };  // namespace opg
-  };  // namespace KNN
-};    // namespace ML
+};  // namespace knn_common
+};  // namespace opg
+};  // namespace KNN
+};  // namespace ML
\ No newline at end of file
diff --git a/cpp/src/knn/knn_sparse.cu b/cpp/src/knn/knn_sparse.cu
index 00e675b226..88f7871f39 100644
--- a/cpp/src/knn/knn_sparse.cu
+++ b/cpp/src/knn/knn_sparse.cu
@@ -17,60 +17,55 @@
 #include <cuml/common/logger.hpp>
 #include <cuml/neighbors/knn_sparse.hpp>
 
-<<<<<<< HEAD
-#include <raft/sparse/selection/knn.hpp>
-#include <raft/spatial/knn/specializations.hpp>
-=======
 #include <raft/sparse/selection/knn.cuh>
->>>>>>> branch-22.10
+#include <raft/spatial/knn/specializations.hpp>
 
 #include <cusparse_v2.h>
 
-namespace ML
-{
-  namespace Sparse {
+namespace ML {
+namespace Sparse {
 
-  void brute_force_knn(raft::handle_t& handle,
-                       const int* idx_indptr,
-                       const int* idx_indices,
-                       const float* idx_data,
-                       size_t idx_nnz,
-                       int n_idx_rows,
-                       int n_idx_cols,
-                       const int* query_indptr,
-                       const int* query_indices,
-                       const float* query_data,
-                       size_t query_nnz,
-                       int n_query_rows,
-                       int n_query_cols,
-                       int* output_indices,
-                       float* output_dists,
-                       int k,
-                       size_t batch_size_index,  // approx 1M
-                       size_t batch_size_query,
-                       raft::distance::DistanceType metric,
-                       float metricArg)
-  {
-    raft::sparse::selection::brute_force_knn(idx_indptr,
-                                             idx_indices,
-                                             idx_data,
-                                             idx_nnz,
-                                             n_idx_rows,
-                                             n_idx_cols,
-                                             query_indptr,
-                                             query_indices,
-                                             query_data,
-                                             query_nnz,
-                                             n_query_rows,
-                                             n_query_cols,
-                                             output_indices,
-                                             output_dists,
-                                             k,
-                                             handle,
-                                             batch_size_index,
-                                             batch_size_query,
-                                             metric,
-                                             metricArg);
-  }
-  };  // namespace Sparse
-};    // namespace ML
+void brute_force_knn(raft::handle_t& handle,
+                     const int* idx_indptr,
+                     const int* idx_indices,
+                     const float* idx_data,
+                     size_t idx_nnz,
+                     int n_idx_rows,
+                     int n_idx_cols,
+                     const int* query_indptr,
+                     const int* query_indices,
+                     const float* query_data,
+                     size_t query_nnz,
+                     int n_query_rows,
+                     int n_query_cols,
+                     int* output_indices,
+                     float* output_dists,
+                     int k,
+                     size_t batch_size_index,  // approx 1M
+                     size_t batch_size_query,
+                     raft::distance::DistanceType metric,
+                     float metricArg)
+{
+  raft::sparse::selection::brute_force_knn(idx_indptr,
+                                           idx_indices,
+                                           idx_data,
+                                           idx_nnz,
+                                           n_idx_rows,
+                                           n_idx_cols,
+                                           query_indptr,
+                                           query_indices,
+                                           query_data,
+                                           query_nnz,
+                                           n_query_rows,
+                                           n_query_cols,
+                                           output_indices,
+                                           output_dists,
+                                           k,
+                                           handle,
+                                           batch_size_index,
+                                           batch_size_query,
+                                           metric,
+                                           metricArg);
+}
+};  // namespace Sparse
+};  // namespace ML
diff --git a/cpp/src/metrics/trustworthiness.cu b/cpp/src/metrics/trustworthiness.cu
index 6f392bd076..9fb2512fd2 100644
--- a/cpp/src/metrics/trustworthiness.cu
+++ b/cpp/src/metrics/trustworthiness.cu
@@ -18,58 +18,58 @@
 
 #include <cuml/metrics/metrics.hpp>
 
-<<<<<<< HEAD
-#include <raft/distance/distance.hpp>
+#if defined RAFT_DISTANCE_COMPILED
 #include <raft/distance/specializations.hpp>
+#endif
+
+#if defined RAFT_NN_COMPILED
 #include <raft/spatial/knn/specializations.hpp>
-=======
+#endif
+
 #include <raft/distance/distance.cuh>
-#include <raft/distance/specializations.cuh>
->>>>>>> branch-22.10
 
 #include <raft/core/handle.hpp>
 
-namespace ML
-{
-  namespace Metrics {
+namespace ML {
+namespace Metrics {
 
-  /**
-   * @brief Compute the trustworthiness score
-   *
-   * @param h Raft handle
-   * @param X Data in original dimension
-   * @param X_embedded Data in target dimension (embedding)
-   * @param n Number of samples
-   * @param m Number of features in high/original dimension
-   * @param d Number of features in low/embedded dimension
-   * @param n_neighbors Number of neighbors considered by trustworthiness score
-   * @param batchSize Batch size
-   * @tparam distance_type: Distance type to consider
-   * @return Trustworthiness score
-   */
-  template <typename math_t, raft::distance::DistanceType distance_type>
-  double trustworthiness_score(const raft::handle_t& h,
-                               const math_t* X,
-                               math_t* X_embedded,
-                               int n,
-                               int m,
-                               int d,
-                               int n_neighbors,
-                               int batchSize)
-  {
-    return raft::stats::trustworthiness_score<math_t, distance_type>(
-      h, X, X_embedded, n, m, d, n_neighbors, batchSize);
-  }
+/**
+ * @brief Compute the trustworthiness score
+ *
+ * @param h Raft handle
+ * @param X Data in original dimension
+ * @param X_embedded Data in target dimension (embedding)
+ * @param n Number of samples
+ * @param m Number of features in high/original dimension
+ * @param d Number of features in low/embedded dimension
+ * @param n_neighbors Number of neighbors considered by trustworthiness score
+ * @param batchSize Batch size
+ * @tparam distance_type: Distance type to consider
+ * @return Trustworthiness score
+ */
+template <typename math_t, raft::distance::DistanceType distance_type>
+double trustworthiness_score(const raft::handle_t& h,
+                             const math_t* X,
+                             math_t* X_embedded,
+                             int n,
+                             int m,
+                             int d,
+                             int n_neighbors,
+                             int batchSize)
+{
+  return raft::stats::trustworthiness_score<math_t, distance_type>(
+    h, X, X_embedded, n, m, d, n_neighbors, batchSize);
+}
 
-  template double trustworthiness_score<float, raft::distance::DistanceType::L2SqrtUnexpanded>(
-    const raft::handle_t& h,
-    const float* X,
-    float* X_embedded,
-    int n,
-    int m,
-    int d,
-    int n_neighbors,
-    int batchSize);
+template double trustworthiness_score<float, raft::distance::DistanceType::L2SqrtUnexpanded>(
+  const raft::handle_t& h,
+  const float* X,
+  float* X_embedded,
+  int n,
+  int m,
+  int d,
+  int n_neighbors,
+  int batchSize);
 
-  };  // end namespace Metrics
-};    // end namespace ML
+};  // end namespace Metrics
+};  // end namespace ML
diff --git a/cpp/src/metrics/v_measure.cu b/cpp/src/metrics/v_measure.cu
index e95499e811..75de521971 100644
--- a/cpp/src/metrics/v_measure.cu
+++ b/cpp/src/metrics/v_measure.cu
@@ -16,7 +16,7 @@
  */
 
 #include <cuml/metrics/metrics.hpp>
-#include <raft/stats/v_measure.hpp>
+#include <raft/stats/v_measure.cuh>
 
 namespace ML {
 
@@ -30,13 +30,9 @@ double v_measure(const raft::handle_t& handle,
                  const int upper_class_range,
                  double beta)
 {
-<<<<<<< HEAD
-  return raft::stats::v_measure(
-    y, y_hat, n, lower_class_range, upper_class_range, handle.get_stream());
-=======
-  return MLCommon::Metrics::v_measure(
-    y, y_hat, n, lower_class_range, upper_class_range, handle.get_stream(), beta);
->>>>>>> branch-22.10
+  // Forward beta to RAFT; omitting it would silently ignore the caller-supplied value.
+  return raft::stats::v_measure(
+    y, y_hat, n, lower_class_range, upper_class_range, handle.get_stream(), beta);
 }
 }  // namespace Metrics
 }  // namespace ML
diff --git a/cpp/src/randomforest/randomforest.cuh b/cpp/src/randomforest/randomforest.cuh
index 2e68a83b9f..ebcce3b224 100644
--- a/cpp/src/randomforest/randomforest.cuh
+++ b/cpp/src/randomforest/randomforest.cuh
@@ -20,18 +20,13 @@
 #include <decisiontree/decisiontree.cuh>
 #include <decisiontree/treelite_util.h>
 
-<<<<<<< HEAD
 #include <raft/random/permute.hpp>
-=======
-#include <metrics/scores.cuh>
 
-#include <raft/core/cudart_utils.hpp>
 #include <raft/core/nvtx.hpp>
-#include <raft/random/permute.cuh>
->>>>>>> branch-22.10
 #include <raft/random/rng.cuh>
 #include <raft/stats/accuracy.hpp>
 #include <raft/stats/regression_metrics.hpp>
+#include <raft/util/cudart_utils.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/sequence.h>
@@ -45,277 +40,275 @@
 
 #include <map>
 
-namespace ML
-{
-  template <class T, class L>
-  class RandomForest {
-   protected:
-    RF_params rf_params;  // structure containing RF hyperparameters
-    int rf_type;          // 0 for classification 1 for regression
-
-    void get_row_sample(int tree_id,
-                        int n_rows,
-                        rmm::device_uvector<int>* selected_rows,
-                        const cudaStream_t stream)
-    {
-      raft::common::nvtx::range fun_scope("bootstrapping row IDs @randomforest.cuh");
-
-      // Hash these together so they are uncorrelated
-      auto rs = DT::fnv1a32_basis;
-      rs      = DT::fnv1a32(rs, rf_params.seed);
-      rs      = DT::fnv1a32(rs, tree_id);
-      raft::random::Rng rng(rs, raft::random::GenPhilox);
-      if (rf_params.bootstrap) {
-        // Use bootstrapped sample set
-        rng.uniformInt<int>(selected_rows->data(), selected_rows->size(), 0, n_rows, stream);
-
-      } else {
-        // Use all the samples from the dataset
-        thrust::sequence(
-          thrust::cuda::par.on(stream), selected_rows->begin(), selected_rows->end());
-      }
+namespace ML {
+template <class T, class L>
+class RandomForest {
+ protected:
+  RF_params rf_params;  // structure containing RF hyperparameters
+  int rf_type;          // 0 for classification 1 for regression
+
+  void get_row_sample(int tree_id,
+                      int n_rows,
+                      rmm::device_uvector<int>* selected_rows,
+                      const cudaStream_t stream)
+  {
+    raft::common::nvtx::range fun_scope("bootstrapping row IDs @randomforest.cuh");
+
+    // Hash these together so they are uncorrelated
+    auto rs = DT::fnv1a32_basis;
+    rs      = DT::fnv1a32(rs, rf_params.seed);
+    rs      = DT::fnv1a32(rs, tree_id);
+    raft::random::Rng rng(rs, raft::random::GenPhilox);
+    if (rf_params.bootstrap) {
+      // Use bootstrapped sample set
+      rng.uniformInt<int>(selected_rows->data(), selected_rows->size(), 0, n_rows, stream);
+
+    } else {
+      // Use all the samples from the dataset
+      thrust::sequence(thrust::cuda::par.on(stream), selected_rows->begin(), selected_rows->end());
     }
+  }
 
-    void error_checking(const T* input, L* predictions, int n_rows, int n_cols, bool predict) const
-    {
-      if (predict) {
-        ASSERT(predictions != nullptr, "Error! User has not allocated memory for predictions.");
-      }
-      ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows);
-      ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols);
+  void error_checking(const T* input, L* predictions, int n_rows, int n_cols, bool predict) const
+  {
+    if (predict) {
+      ASSERT(predictions != nullptr, "Error! User has not allocated memory for predictions.");
+    }
+    ASSERT((n_rows > 0), "Invalid n_rows %d", n_rows);
+    ASSERT((n_cols > 0), "Invalid n_cols %d", n_cols);
 
-      bool input_is_dev_ptr = DT::is_dev_ptr(input);
-      bool preds_is_dev_ptr = DT::is_dev_ptr(predictions);
+    bool input_is_dev_ptr = DT::is_dev_ptr(input);
+    bool preds_is_dev_ptr = DT::is_dev_ptr(predictions);
 
-      if (!input_is_dev_ptr || (input_is_dev_ptr != preds_is_dev_ptr)) {
-        ASSERT(false,
-               "RF Error: Expected both input and labels/predictions to be GPU "
-               "pointers");
-      }
+    if (!input_is_dev_ptr || (input_is_dev_ptr != preds_is_dev_ptr)) {
+      ASSERT(false,
+             "RF Error: Expected both input and labels/predictions to be GPU "
+             "pointers");
     }
-
-   public:
-    /**
-     * @brief Construct RandomForest object.
-     * @param[in] cfg_rf_params: Random forest hyper-parameter struct.
-     * @param[in] cfg_rf_type: Task type: 0 for classification, 1 for regression
-     */
-    RandomForest(RF_params cfg_rf_params, int cfg_rf_type = RF_type::CLASSIFICATION)
-      : rf_params(cfg_rf_params), rf_type(cfg_rf_type){};
-
-    /**
-     * @brief Build (i.e., fit, train) random forest for input data.
-     * @param[in] user_handle: raft::handle_t
-     * @param[in] input: train data (n_rows samples, n_cols features) in column major format,
-     *   excluding labels. Device pointer.
-     * @param[in] n_rows: number of training data samples.
-     * @param[in] n_cols: number of features (i.e., columns) excluding target feature.
-     * @param[in] labels: 1D array of target predictions/labels. Device Pointer.
-              For classification task, only labels of type int are supported.
-                Assumption: labels were preprocessed to map to ascending numbers from 0;
-                needed for current gini impl in decision tree
-              For regression task, the labels (predictions) can be float or double data type.
-    * @param[in] n_unique_labels: (meaningful only for classification) #unique label values (known
-    during preprocessing)
-    * @param[in] forest: CPU point to RandomForestMetaData struct.
-    */
-    void fit(const raft::handle_t& user_handle,
-             const T* input,
-             int n_rows,
-             int n_cols,
-             L* labels,
-             int n_unique_labels,
-             RandomForestMetaData<T, L>*& forest)
-    {
-      raft::common::nvtx::range fun_scope("RandomForest::fit @randomforest.cuh");
-      this->error_checking(input, labels, n_rows, n_cols, false);
-      const raft::handle_t& handle = user_handle;
-      int n_sampled_rows           = 0;
-      if (this->rf_params.bootstrap) {
-        n_sampled_rows = std::round(this->rf_params.max_samples * n_rows);
-      } else {
-        if (this->rf_params.max_samples != 1.0) {
-          CUML_LOG_WARN(
-            "If bootstrap sampling is disabled, max_samples value is ignored and "
-            "whole dataset is used for building each tree");
-          this->rf_params.max_samples = 1.0;
-        }
-        n_sampled_rows = n_rows;
-      }
-      int n_streams = this->rf_params.n_streams;
-      ASSERT(static_cast<std::size_t>(n_streams) <= handle.get_stream_pool_size(),
-             "rf_params.n_streams (=%d) should be <= raft::handle_t.n_streams (=%lu)",
-             n_streams,
-             handle.get_stream_pool_size());
-
-      // computing the quantiles: last two return values are shared pointers to device memory
-      // encapsulated by quantiles struct
-      auto [quantiles, quantiles_array, n_bins_array] =
-        DT::computeQuantiles(handle, input, this->rf_params.tree_params.max_n_bins, n_rows, n_cols);
-
-      // n_streams should not be less than n_trees
-      if (this->rf_params.n_trees < n_streams) n_streams = this->rf_params.n_trees;
-
-      // Select n_sampled_rows (with replacement) numbers from [0, n_rows) per tree.
-      // selected_rows: randomly generated IDs for bootstrapped samples (w/ replacement); a device
-      // ptr.
-      // Use a deque instead of vector because it can be used on objects with a deleted copy
-      // constructor
-      std::deque<rmm::device_uvector<int>> selected_rows;
-      for (int i = 0; i < n_streams; i++) {
-        selected_rows.emplace_back(n_sampled_rows, handle.get_stream_from_stream_pool(i));
+  }
+
+ public:
+  /**
+   * @brief Construct RandomForest object.
+   * @param[in] cfg_rf_params: Random forest hyper-parameter struct.
+   * @param[in] cfg_rf_type: Task type: 0 for classification, 1 for regression
+   */
+  RandomForest(RF_params cfg_rf_params, int cfg_rf_type = RF_type::CLASSIFICATION)
+    : rf_params(cfg_rf_params), rf_type(cfg_rf_type){};
+
+  /**
+   * @brief Build (i.e., fit, train) random forest for input data.
+   * @param[in] user_handle: raft::handle_t
+   * @param[in] input: train data (n_rows samples, n_cols features) in column major format,
+   *   excluding labels. Device pointer.
+   * @param[in] n_rows: number of training data samples.
+   * @param[in] n_cols: number of features (i.e., columns) excluding target feature.
+   * @param[in] labels: 1D array of target predictions/labels. Device Pointer.
+            For classification task, only labels of type int are supported.
+              Assumption: labels were preprocessed to map to ascending numbers from 0;
+              needed for current gini impl in decision tree
+            For regression task, the labels (predictions) can be float or double data type.
+  * @param[in] n_unique_labels: (meaningful only for classification) #unique label values (known
+  during preprocessing)
+  * @param[in] forest: CPU pointer to RandomForestMetaData struct.
+  */
+  void fit(const raft::handle_t& user_handle,
+           const T* input,
+           int n_rows,
+           int n_cols,
+           L* labels,
+           int n_unique_labels,
+           RandomForestMetaData<T, L>*& forest)
+  {
+    raft::common::nvtx::range fun_scope("RandomForest::fit @randomforest.cuh");
+    this->error_checking(input, labels, n_rows, n_cols, false);
+    const raft::handle_t& handle = user_handle;
+    int n_sampled_rows           = 0;
+    if (this->rf_params.bootstrap) {
+      n_sampled_rows = std::round(this->rf_params.max_samples * n_rows);
+    } else {
+      if (this->rf_params.max_samples != 1.0) {
+        CUML_LOG_WARN(
+          "If bootstrap sampling is disabled, max_samples value is ignored and "
+          "whole dataset is used for building each tree");
+        this->rf_params.max_samples = 1.0;
       }
+      n_sampled_rows = n_rows;
+    }
+    int n_streams = this->rf_params.n_streams;
+    ASSERT(static_cast<std::size_t>(n_streams) <= handle.get_stream_pool_size(),
+           "rf_params.n_streams (=%d) should be <= raft::handle_t.n_streams (=%lu)",
+           n_streams,
+           handle.get_stream_pool_size());
+
+    // computing the quantiles: last two return values are shared pointers to device memory
+    // encapsulated by quantiles struct
+    auto [quantiles, quantiles_array, n_bins_array] =
+      DT::computeQuantiles(handle, input, this->rf_params.tree_params.max_n_bins, n_rows, n_cols);
+
+    // n_streams should not be greater than n_trees
+    if (this->rf_params.n_trees < n_streams) n_streams = this->rf_params.n_trees;
+
+    // Select n_sampled_rows (with replacement) numbers from [0, n_rows) per tree.
+    // selected_rows: randomly generated IDs for bootstrapped samples (w/ replacement); a device
+    // ptr.
+    // Use a deque instead of vector because it can be used on objects with a deleted copy
+    // constructor
+    std::deque<rmm::device_uvector<int>> selected_rows;
+    for (int i = 0; i < n_streams; i++) {
+      selected_rows.emplace_back(n_sampled_rows, handle.get_stream_from_stream_pool(i));
+    }
 
 #pragma omp parallel for num_threads(n_streams)
+    for (int i = 0; i < this->rf_params.n_trees; i++) {
+      int stream_id = omp_get_thread_num();
+      auto s        = handle.get_stream_from_stream_pool(stream_id);
+
+      this->get_row_sample(i, n_rows, &selected_rows[stream_id], s);
+
+      /* Build individual tree in the forest.
+        - input is a pointer to orig data that have n_cols features and n_rows rows.
+        - n_sampled_rows: # rows sampled for tree's bootstrap sample.
+        - selected_rows[stream_id]: points to a list of row #s (w/ n_sampled_rows elements)
+          used to build the bootstrapped sample.
+          Expectation: Each tree node will contain (a) # n_sampled_rows and
+          (b) a pointer to a list of row numbers w.r.t original data.
+      */
+
+      forest->trees[i] = DT::DecisionTree::fit(handle,
+                                               s,
+                                               input,
+                                               n_cols,
+                                               n_rows,
+                                               labels,
+                                               &selected_rows[stream_id],
+                                               n_unique_labels,
+                                               this->rf_params.tree_params,
+                                               this->rf_params.seed,
+                                               quantiles,
+                                               i);
+    }
+    // Cleanup
+    handle.sync_stream_pool();
+    handle.sync_stream();
+  }
+
+  /**
+   * @brief Predict target feature for input data
+   * @param[in] user_handle: raft::handle_t.
+   * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU
+   * pointer.
+   * @param[in] n_rows: number of data samples.
+   * @param[in] n_cols: number of features (excluding target feature).
+   * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated.
+   * @param[in] verbosity: verbosity level for logging messages during execution
+   */
+  void predict(const raft::handle_t& user_handle,
+               const T* input,
+               int n_rows,
+               int n_cols,
+               L* predictions,
+               const RandomForestMetaData<T, L>* forest,
+               int verbosity) const
+  {
+    ML::Logger::get().setLevel(verbosity);
+    this->error_checking(input, predictions, n_rows, n_cols, true);
+    std::vector<L> h_predictions(n_rows);
+    cudaStream_t stream = user_handle.get_stream();
+
+    std::vector<T> h_input(std::size_t(n_rows) * n_cols);
+    raft::update_host(h_input.data(), input, std::size_t(n_rows) * n_cols, stream);
+    user_handle.sync_stream(stream);
+
+    int row_size = n_cols;
+
+    ML::PatternSetter _("%v");
+    for (int row_id = 0; row_id < n_rows; row_id++) {
+      std::vector<T> row_prediction(forest->trees[0]->num_outputs);
       for (int i = 0; i < this->rf_params.n_trees; i++) {
-        int stream_id = omp_get_thread_num();
-        auto s        = handle.get_stream_from_stream_pool(stream_id);
-
-        this->get_row_sample(i, n_rows, &selected_rows[stream_id], s);
-
-        /* Build individual tree in the forest.
-          - input is a pointer to orig data that have n_cols features and n_rows rows.
-          - n_sampled_rows: # rows sampled for tree's bootstrap sample.
-          - sorted_selected_rows: points to a list of row #s (w/ n_sampled_rows elements)
-            used to build the bootstrapped sample.
-            Expectation: Each tree node will contain (a) # n_sampled_rows and
-            (b) a pointer to a list of row numbers w.r.t original data.
-        */
-
-        forest->trees[i] = DT::DecisionTree::fit(handle,
-                                                 s,
-                                                 input,
-                                                 n_cols,
-                                                 n_rows,
-                                                 labels,
-                                                 &selected_rows[stream_id],
-                                                 n_unique_labels,
-                                                 this->rf_params.tree_params,
-                                                 this->rf_params.seed,
-                                                 quantiles,
-                                                 i);
+        DT::DecisionTree::predict(user_handle,
+                                  *forest->trees[i],
+                                  &h_input[row_id * row_size],
+                                  1,
+                                  n_cols,
+                                  row_prediction.data(),
+                                  forest->trees[i]->num_outputs,
+                                  verbosity);
       }
-      // Cleanup
-      handle.sync_stream_pool();
-      handle.sync_stream();
-    }
-
-    /**
-     * @brief Predict target feature for input data
-     * @param[in] user_handle: raft::handle_t.
-     * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU
-     * pointer.
-     * @param[in] n_rows: number of  data samples.
-     * @param[in] n_cols: number of features (excluding target feature).
-     * @param[in, out] predictions: n_rows predicted labels. GPU pointer, user allocated.
-     * @param[in] verbosity: verbosity level for logging messages during execution
-     */
-    void predict(const raft::handle_t& user_handle,
-                 const T* input,
-                 int n_rows,
-                 int n_cols,
-                 L* predictions,
-                 const RandomForestMetaData<T, L>* forest,
-                 int verbosity) const
-    {
-      ML::Logger::get().setLevel(verbosity);
-      this->error_checking(input, predictions, n_rows, n_cols, true);
-      std::vector<L> h_predictions(n_rows);
-      cudaStream_t stream = user_handle.get_stream();
-
-      std::vector<T> h_input(std::size_t(n_rows) * n_cols);
-      raft::update_host(h_input.data(), input, std::size_t(n_rows) * n_cols, stream);
-      user_handle.sync_stream(stream);
-
-      int row_size = n_cols;
-
-      ML::PatternSetter _("%v");
-      for (int row_id = 0; row_id < n_rows; row_id++) {
-        std::vector<T> row_prediction(forest->trees[0]->num_outputs);
-        for (int i = 0; i < this->rf_params.n_trees; i++) {
-          DT::DecisionTree::predict(user_handle,
-                                    *forest->trees[i],
-                                    &h_input[row_id * row_size],
-                                    1,
-                                    n_cols,
-                                    row_prediction.data(),
-                                    forest->trees[i]->num_outputs,
-                                    verbosity);
-        }
+      for (int k = 0; k < forest->trees[0]->num_outputs; k++) {
+        row_prediction[k] /= this->rf_params.n_trees;
+      }
+      if (rf_type == RF_type::CLASSIFICATION) {  // classification task: use 'majority' prediction
+        L best_class = 0;
+        T best_prob  = 0.0;
         for (int k = 0; k < forest->trees[0]->num_outputs; k++) {
-          row_prediction[k] /= this->rf_params.n_trees;
-        }
-        if (rf_type == RF_type::CLASSIFICATION) {  // classification task: use 'majority' prediction
-          L best_class = 0;
-          T best_prob  = 0.0;
-          for (int k = 0; k < forest->trees[0]->num_outputs; k++) {
-            if (row_prediction[k] > best_prob) {
-              best_class = k;
-              best_prob  = row_prediction[k];
-            }
+          if (row_prediction[k] > best_prob) {
+            best_class = k;
+            best_prob  = row_prediction[k];
           }
-
-          h_predictions[row_id] = best_class;
-        } else {
-          h_predictions[row_id] = row_prediction[0];
         }
-      }
-
-      raft::update_device(predictions, h_predictions.data(), n_rows, stream);
-      user_handle.sync_stream(stream);
-    }
 
-    /**
-     * @brief Predict target feature for input data and score against ref_labels.
-     * @param[in] user_handle: raft::handle_t.
-     * @param[in] input: test data (n_rows samples, n_cols features) in row major format. GPU
-     * pointer.
-     * @param[in] ref_labels: label values for cross validation (n_rows elements); GPU pointer.
-     * @param[in] n_rows: number of  data samples.
-     * @param[in] n_cols: number of features (excluding target feature).
-     * @param[in] predictions: n_rows predicted labels. GPU pointer, user allocated.
-     * @param[in] verbosity: verbosity level for logging messages during execution
-     * @param[in] rf_type: task type: 0 for classification, 1 for regression
-     */
-    static RF_metrics score(const raft::handle_t& user_handle,
-                            const L* ref_labels,
-                            int n_rows,
-                            const L* predictions,
-                            int verbosity,
-                            int rf_type = RF_type::CLASSIFICATION)
-    {
-      ML::Logger::get().setLevel(verbosity);
-      cudaStream_t stream = user_handle.get_stream();
-      RF_metrics stats;
-      if (rf_type == RF_type::CLASSIFICATION) {  // task classifiation: get classification metrics
-        float accuracy = raft::stats::accuracy(predictions, ref_labels, n_rows, stream);
-        stats          = set_rf_metrics_classification(accuracy);
-        if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) print(stats);
-
-        /* TODO: Potentially augment RF_metrics w/ more metrics (e.g., precision, F1, etc.).
-          For non binary classification problems (i.e., one target and  > 2 labels), need avg.
-          for each of these metrics */
-      } else {  // regression task: get regression metrics
-        double mean_abs_error, mean_squared_error, median_abs_error;
-        raft::stats::regression_metrics(predictions,
-                                        ref_labels,
-                                        n_rows,
-                                        stream,
-                                        mean_abs_error,
-                                        mean_squared_error,
-                                        median_abs_error);
-        stats = set_rf_metrics_regression(mean_abs_error, mean_squared_error, median_abs_error);
-        if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) print(stats);
+        h_predictions[row_id] = best_class;
+      } else {
+        h_predictions[row_id] = row_prediction[0];
       }
+    }
 
-      return stats;
+    raft::update_device(predictions, h_predictions.data(), n_rows, stream);
+    user_handle.sync_stream(stream);
+  }
+
+  /**
+   * @brief Score already-computed predictions against ref_labels.
+   * @param[in] user_handle: raft::handle_t.
+   * @param[in] ref_labels: label values for cross validation (n_rows elements); GPU pointer.
+   * @param[in] n_rows: number of data samples.
+   * @param[in] predictions: n_rows predicted labels. GPU pointer, user allocated.
+   * @param[in] verbosity: verbosity level for logging messages during execution
+   * @param[in] rf_type: task type: 0 for classification, 1 for regression
+   * @return RF_metrics built from the accuracy (classification) or from the mean absolute,
+   *   mean squared and median absolute errors (regression). This function does not run
+   *   prediction and takes neither the input data nor n_cols.
+   */
+  static RF_metrics score(const raft::handle_t& user_handle,
+                          const L* ref_labels,
+                          int n_rows,
+                          const L* predictions,
+                          int verbosity,
+                          int rf_type = RF_type::CLASSIFICATION)
+  {
+    ML::Logger::get().setLevel(verbosity);
+    cudaStream_t stream = user_handle.get_stream();
+    RF_metrics stats;
+    if (rf_type == RF_type::CLASSIFICATION) {  // classification task: get classification metrics
+      float accuracy = raft::stats::accuracy(predictions, ref_labels, n_rows, stream);
+      stats          = set_rf_metrics_classification(accuracy);
+      if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) print(stats);
+
+      /* TODO: Potentially augment RF_metrics w/ more metrics (e.g., precision, F1, etc.).
+        For non binary classification problems (i.e., one target and  > 2 labels), need avg.
+        for each of these metrics */
+    } else {  // regression task: get regression metrics
+      double mean_abs_error, mean_squared_error, median_abs_error;
+      raft::stats::regression_metrics(predictions,
+                                      ref_labels,
+                                      n_rows,
+                                      stream,
+                                      mean_abs_error,
+                                      mean_squared_error,
+                                      median_abs_error);
+      stats = set_rf_metrics_regression(mean_abs_error, mean_squared_error, median_abs_error);
+      if (ML::Logger::get().shouldLogFor(CUML_LEVEL_DEBUG)) print(stats);
     }
-  };
 
-  // class specializations
-  template class RandomForest<float, int>;
-  template class RandomForest<float, float>;
-  template class RandomForest<double, int>;
-  template class RandomForest<double, double>;
+    return stats;
+  }
+};
+
+// class specializations
+template class RandomForest<float, int>;
+template class RandomForest<float, float>;
+template class RandomForest<double, int>;
+template class RandomForest<double, double>;
 
 }  // End namespace ML
diff --git a/cpp/src/svm/kernelcache.cuh b/cpp/src/svm/kernelcache.cuh
index 5b75ac621f..dea834de6a 100644
--- a/cpp/src/svm/kernelcache.cuh
+++ b/cpp/src/svm/kernelcache.cuh
@@ -30,6 +30,8 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <cuml/common/logger.hpp>
+
 #include <cub/cub.cuh>
 
 #include <algorithm>
@@ -331,7 +333,7 @@ class KernelCache {
 
   const int TPB = 256;  //!< threads per block for kernels launched
 
-  raft::util::cache::Cache<math_t> cache;
+  raft::cache::Cache<math_t> cache;
 
   cudaStream_t stream;
   SvmType svmType;
diff --git a/cpp/src/svm/svc.cu b/cpp/src/svm/svc.cu
index 37746ad599..5f33333633 100644
--- a/cpp/src/svm/svc.cu
+++ b/cpp/src/svm/svc.cu
@@ -24,6 +24,7 @@
 #include "smosolver.cuh"
 #include "svc_impl.cuh"
 #include <cuml/svm/svc.hpp>
+#include <raft/distance/distance_types.hpp>
 #include <raft/distance/kernels.cuh>
 #include <raft/label/classlabels.cuh>
 #include <raft/linalg/unary_op.cuh>
@@ -32,6 +33,7 @@ namespace ML {
 namespace SVM {
 
 using namespace MLCommon;
+using namespace raft::distance::kernels;
 
 // Explicit instantiation for the library
 template void svcFit<float>(const raft::handle_t& handle,
@@ -40,7 +42,7 @@ template void svcFit<float>(const raft::handle_t& handle,
                             int n_cols,
                             float* labels,
                             const SvmParameter& param,
-                            MLCommon::Matrix::KernelParams& kernel_params,
+                            KernelParams& kernel_params,
                             SvmModel<float>& model,
                             const float* sample_weight);
 
@@ -50,7 +52,7 @@ template void svcFit<double>(const raft::handle_t& handle,
                              int n_cols,
                              double* labels,
                              const SvmParameter& param,
-                             MLCommon::Matrix::KernelParams& kernel_params,
+                             KernelParams& kernel_params,
                              SvmModel<double>& model,
                              const double* sample_weight);
 
@@ -58,7 +60,7 @@ template void svcPredict<float>(const raft::handle_t& handle,
                                 float* input,
                                 int n_rows,
                                 int n_cols,
-                                MLCommon::Matrix::KernelParams& kernel_params,
+                                KernelParams& kernel_params,
                                 const SvmModel<float>& model,
                                 float* preds,
                                 float buffer_size,
@@ -68,7 +70,7 @@ template void svcPredict<double>(const raft::handle_t& handle,
                                  double* input,
                                  int n_rows,
                                  int n_cols,
-                                 MLCommon::Matrix::KernelParams& kernel_params,
+                                 KernelParams& kernel_params,
                                  const SvmModel<double>& model,
                                  double* preds,
                                  double buffer_size,
@@ -82,7 +84,7 @@ template <typename math_t>
 SVC<math_t>::SVC(raft::handle_t& handle,
                  math_t C,
                  math_t tol,
-                 Matrix::KernelParams kernel_params,
+                 KernelParams kernel_params,
                  math_t cache_size,
                  int max_iter,
                  int nochange_steps,
diff --git a/cpp/src/svm/svc_impl.cuh b/cpp/src/svm/svc_impl.cuh
index f1baf827c9..375155b25c 100644
--- a/cpp/src/svm/svc_impl.cuh
+++ b/cpp/src/svm/svc_impl.cuh
@@ -75,7 +75,7 @@ void svcFit(const raft::handle_t& handle,
   ASSERT(model.n_classes == 2, "Only binary classification is implemented at the moment");
 
   rmm::device_uvector<math_t> y(n_rows, stream);
-  MLCommon::Label::getOvrLabels(
+  raft::label::getOvrlabels(
     labels, n_rows, model.unique_labels, model.n_classes, y.data(), 1, stream);
 
   raft::distance::kernels::GramMatrixBase<math_t>* kernel =
diff --git a/cpp/src/svm/svr.cu b/cpp/src/svm/svr.cu
index 870061cf71..9745ae0f0a 100644
--- a/cpp/src/svm/svr.cu
+++ b/cpp/src/svm/svr.cu
@@ -24,13 +24,16 @@
 #include "smosolver.cuh"
 #include "svr_impl.cuh"
 #include <cuml/svm/svc.hpp>
-#include <label/classlabels.cuh>
+#include <raft/distance/distance_types.hpp>
 #include <raft/distance/kernels.cuh>
+#include <raft/label/classlabels.cuh>
 #include <raft/linalg/unary_op.cuh>
 
 namespace ML {
 namespace SVM {
 
+using namespace raft::distance::kernels;
+
 // Explicit instantiation for the library
 template void svrFit<float>(const raft::handle_t& handle,
                             float* X,
@@ -38,7 +41,7 @@ template void svrFit<float>(const raft::handle_t& handle,
                             int n_cols,
                             float* y,
                             const SvmParameter& param,
-                            MLCommon::Matrix::KernelParams& kernel_params,
+                            KernelParams& kernel_params,
                             SvmModel<float>& model,
                             const float* sample_weight);
 
@@ -48,7 +51,7 @@ template void svrFit<double>(const raft::handle_t& handle,
                              int n_cols,
                              double* y,
                              const SvmParameter& param,
-                             MLCommon::Matrix::KernelParams& kernel_params,
+                             KernelParams& kernel_params,
                              SvmModel<double>& model,
                              const double* sample_weight);
 
diff --git a/cpp/src/svm/svr_impl.cuh b/cpp/src/svm/svr_impl.cuh
index b904b8c704..735e5a7e4f 100644
--- a/cpp/src/svm/svr_impl.cuh
+++ b/cpp/src/svm/svr_impl.cuh
@@ -46,7 +46,7 @@ void svrFit(const raft::handle_t& handle,
             int n_cols,
             math_t* y,
             const SvmParameter& param,
-            MLCommon::Matrix::KernelParams& kernel_params,
+            raft::distance::kernels::KernelParams& kernel_params,
             SvmModel<math_t>& model,
             const math_t* sample_weight)
 {
@@ -59,8 +59,9 @@ void svrFit(const raft::handle_t& handle,
   const raft::handle_t& handle_impl = handle;
 
   cudaStream_t stream = handle_impl.get_stream();
-  MLCommon::Matrix::GramMatrixBase<math_t>* kernel =
-    MLCommon::Matrix::KernelFactory<math_t>::create(kernel_params, handle_impl.get_cublas_handle());
+  raft::distance::kernels::GramMatrixBase<math_t>* kernel =
+    raft::distance::kernels::KernelFactory<math_t>::create(kernel_params,
+                                                           handle_impl.get_cublas_handle());
 
   SmoSolver<math_t> smo(handle_impl, param, kernel);
   smo.Solve(X,
diff --git a/cpp/src/tsne/tsne.cu b/cpp/src/tsne/tsne.cu
index 5432910e1a..e9e3fd0fde 100644
--- a/cpp/src/tsne/tsne.cu
+++ b/cpp/src/tsne/tsne.cu
@@ -16,71 +16,71 @@
 
 #include "tsne_runner.cuh"
 #include <cuml/manifold/tsne.h>
-<<<<<<< HEAD
+
+#if defined RAFT_DISTANCE_COMPILED
 #include <raft/spatial/knn/specializations.hpp>
-=======
-#include <raft/distance/distance_type.hpp>
->>>>>>> branch-22.10
+#endif
+
+#include <raft/distance/distance_types.hpp>
 
-namespace ML
+namespace ML {
+template <typename tsne_input, typename value_idx, typename value_t>
+value_t _fit(const raft::handle_t& handle,
+             tsne_input& input,
+             knn_graph<value_idx, value_t>& k_graph,
+             TSNEParams& params)
 {
-  template <typename tsne_input, typename value_idx, typename value_t>
-  value_t _fit(const raft::handle_t& handle,
-               tsne_input& input,
-               knn_graph<value_idx, value_t>& k_graph,
-               TSNEParams& params)
-  {
-    TSNE_runner<tsne_input, value_idx, value_t> runner(handle, input, k_graph, params);
+  TSNE_runner<tsne_input, value_idx, value_t> runner(handle, input, k_graph, params);
 
-    return runner.run();  // returns the Kullback–Leibler divergence
-  }
+  return runner.run();  // returns the Kullback–Leibler divergence
+}
 
-  void TSNE_fit(const raft::handle_t& handle,
-                float* X,
-                float* Y,
-                int n,
-                int p,
-                int64_t* knn_indices,
-                float* knn_dists,
-                TSNEParams& params,
-                float* kl_div)
-  {
-    ASSERT(n > 0 && p > 0 && params.dim > 0 && params.n_neighbors > 0 && X != NULL && Y != NULL,
-           "Wrong input args");
+void TSNE_fit(const raft::handle_t& handle,
+              float* X,
+              float* Y,
+              int n,
+              int p,
+              int64_t* knn_indices,
+              float* knn_dists,
+              TSNEParams& params,
+              float* kl_div)
+{
+  ASSERT(n > 0 && p > 0 && params.dim > 0 && params.n_neighbors > 0 && X != NULL && Y != NULL,
+         "Wrong input args");
 
-    manifold_dense_inputs_t<float> input(X, Y, n, p);
-    knn_graph<int64_t, float> k_graph(n, params.n_neighbors, knn_indices, knn_dists);
+  manifold_dense_inputs_t<float> input(X, Y, n, p);
+  knn_graph<int64_t, float> k_graph(n, params.n_neighbors, knn_indices, knn_dists);
 
-    float kl_div_v = _fit<manifold_dense_inputs_t<float>, knn_indices_dense_t, float>(
-      handle, input, k_graph, params);
+  float kl_div_v = _fit<manifold_dense_inputs_t<float>, knn_indices_dense_t, float>(
+    handle, input, k_graph, params);
 
-    if (kl_div) { *kl_div = kl_div_v; }
-  }
+  if (kl_div) { *kl_div = kl_div_v; }
+}
 
-  void TSNE_fit_sparse(const raft::handle_t& handle,
-                       int* indptr,
-                       int* indices,
-                       float* data,
-                       float* Y,
-                       int nnz,
-                       int n,
-                       int p,
-                       int* knn_indices,
-                       float* knn_dists,
-                       TSNEParams& params,
-                       float* kl_div)
-  {
-    ASSERT(n > 0 && p > 0 && params.dim > 0 && params.n_neighbors > 0 && indptr != NULL &&
-             indices != NULL && data != NULL && Y != NULL,
-           "Wrong input args");
+void TSNE_fit_sparse(const raft::handle_t& handle,
+                     int* indptr,
+                     int* indices,
+                     float* data,
+                     float* Y,
+                     int nnz,
+                     int n,
+                     int p,
+                     int* knn_indices,
+                     float* knn_dists,
+                     TSNEParams& params,
+                     float* kl_div)
+{
+  ASSERT(n > 0 && p > 0 && params.dim > 0 && params.n_neighbors > 0 && indptr != NULL &&
+           indices != NULL && data != NULL && Y != NULL,
+         "Wrong input args");
 
-    manifold_sparse_inputs_t<int, float> input(indptr, indices, data, Y, nnz, n, p);
-    knn_graph<int, float> k_graph(n, params.n_neighbors, knn_indices, knn_dists);
+  manifold_sparse_inputs_t<int, float> input(indptr, indices, data, Y, nnz, n, p);
+  knn_graph<int, float> k_graph(n, params.n_neighbors, knn_indices, knn_dists);
 
-    float kl_div_v = _fit<manifold_sparse_inputs_t<int, float>, knn_indices_sparse_t, float>(
-      handle, input, k_graph, params);
+  float kl_div_v = _fit<manifold_sparse_inputs_t<int, float>, knn_indices_sparse_t, float>(
+    handle, input, k_graph, params);
 
-    if (kl_div) { *kl_div = kl_div_v; }
-  }
+  if (kl_div) { *kl_div = kl_div_v; }
+}
 
 }  // namespace ML
diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh
index b8bacf6945..85b98c46b8 100644
--- a/cpp/src/umap/knn_graph/algo.cuh
+++ b/cpp/src/umap/knn_graph/algo.cuh
@@ -21,17 +21,17 @@
 #include <cuml/neighbors/knn_sparse.hpp>
 #include <iostream>
 #include <raft/distance/distance_type.hpp>
-<<<<<<< HEAD
-#include <raft/linalg/unary_op.hpp>
-#include <raft/sparse/selection/knn.hpp>
-#include <raft/spatial/knn/specializations.hpp>
-=======
+
 #include <raft/linalg/unary_op.cuh>
 #include <raft/sparse/selection/knn.cuh>
->>>>>>> branch-22.10
-#include <selection/knn.cuh>
 
-#include <raft/core/cudart_utils.hpp>
+#if defined RAFT_DISTANCE_COMPILED
+#include <raft/spatial/knn/specializations.hpp>
+#endif
+
+#include <raft/spatial/knn/knn.cuh>
+
+#include <raft/util/cudart_utils.hpp>
 
 #include <raft/core/error.hpp>
 
@@ -39,7 +39,7 @@ namespace UMAPAlgo {
 namespace kNNGraph {
 namespace Algo {
 
-  /**
+/**
  * Initial implementation calls out to FAISS to do its work.
  */
 
@@ -67,17 +67,6 @@ inline void launcher(const raft::handle_t& handle,
   ptrs[0]  = inputsA.X;
   sizes[0] = inputsA.n;
 
-<<<<<<< HEAD
-  raft::spatial::knn::brute_force_knn<long, float, int>(handle,
-                                                        ptrs,
-                                                        sizes,
-                                                        inputsA.d,
-                                                        inputsB.X,
-                                                        inputsB.n,
-                                                        out.knn_indices,
-                                                        out.knn_dists,
-                                                        n_neighbors);
-=======
   raft::spatial::knn::brute_force_knn(handle,
                                       ptrs,
                                       sizes,
@@ -89,7 +78,6 @@ inline void launcher(const raft::handle_t& handle,
                                       n_neighbors,
                                       params->metric,
                                       params->p);
->>>>>>> branch-22.10
 }
 
 // Instantiation for dense inputs, int indices
@@ -177,5 +165,4 @@ inline void launcher(const raft::handle_t& handle,
 
 }  // namespace Algo
 }  // namespace kNNGraph
-}
-;  // namespace UMAPAlgo
+};  // namespace UMAPAlgo
diff --git a/cpp/src_prims/cache/cache.cuh b/cpp/src_prims/cache/cache.cuh
deleted file mode 100644
index 958835be8d..0000000000
--- a/cpp/src_prims/cache/cache.cuh
+++ /dev/null
@@ -1,410 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "cache_util.cuh"
-#include <cub/cub.cuh>
-
-#include <cuml/common/logger.hpp>
-
-#include <raft/core/cudart_utils.hpp>
-#include <raft/core/interruptible.hpp>
-#include <raft/cuda_utils.cuh>
-#include <rmm/device_scalar.hpp>
-#include <rmm/device_uvector.hpp>
-
-#include <cstddef>
-
-namespace MLCommon {
-namespace Cache {
-
-/**
- * @brief Associative cache with least recently used replacement policy.
- *
- * SW managed cache in device memory, for ML algos where we can trade memory
- * access for computation. The two main functions of this class are the
- * management of cache indices, and methods to retrieve/store data using the
- * cache indices.
- *
- * The index management can be considered as a hash map<int, int>, where the int
- * keys are the original vector indices that we want to store, and the values are
- * the cache location of these vectors. The keys are hashed into a bucket
- * whose size equals the associativity. These are the cache sets. If a cache
- * set is full, then new indices are stored by replacing the oldest entries.
- *
- * Using this index mapping we implement methods to store and retrive data from
- * the cache buffer, where a unit of data that we are storing is math_t[n_vec].
- * For example in SVM we store full columns of the kernel matrix at each cache
- * entry.
- *
- * Note: we should have a look if the index management could be simplified using
- * concurrent_unordered_map.cuh from cudf. See Issue #914.
- *
- * Example usage:
- * @code{.cpp}
- *
- * // An expensive calculation that we want to accelerate with caching:
- * // we have n keys, and for each key we generate a vector with m elements.
- * // The keys and the output values are stored in GPU memory.
- * void calc(int *key, int n, int m, float *out, cudaStream_t stream) {
- *   for (k=0; k<n; k++) {
- *     // use key[k] to generate out[i + m*k],  where i=0..m-1
- *   }
- * }
- *
- * // We assume that our ML algo repeatedly calls calc, and the set of keys have
- * // an overlap. We will use the cache to avoid repeated calculations.
- *
- * // Assume we have raft::handle_t& h, and cudaStream_t stream
- * Cache<float> cache(h.get_device_allocator(), stream, m);
- *
- * // A buffer that we will reuse to store the cache indices.
- * rmm::device_uvector<int> cache_idx(h.get_device_allocator(), stream, n);
- *
- * void cached_calc(int *key, int n, int m, float *out, stream) {
- *   int n_cached = 0;
- *
- *   cache.GetCacheIdxPartitioned(key, n, cache_idx.data(), &n_cached,
- *                                cudaStream_t stream);
- *
- *   // Note: GetCacheIdxPartitioned has reordered the keys so that
- *   // key[0..n_cached-1] are the keys already in the cache.
- *   // We collect the corresponding values
- *   cache.GetVecs(cache_idx.data(), n_cached, out, stream);
- *
- *   // Calculate the elements not in the cache
- *   int non_cached = n - n_cached;
- *   if (non_cached > 0) {
- *     int *key_new = key + n_cached;
- *     int *cache_idx_new = cache_idx.data() + n_cached;
- *     float *out_new = out + n_cached * m;
- *     // AssignCacheIdx can permute the keys, therefore it has to come before
- *     // we call calc.
- *     // Note: a call to AssignCacheIdx should always be preceded with
- *     // GetCacheIdxPartitioned, because that initializes the cache_idx_new array
- *     // with the cache set (hash bucket) that correspond to the keys.
- *     // The cache idx will be assigned from that cache set.
- *     cache.AssignCacheIdx(key_new, non_cached, cache_idx_new, stream);
- *
- *     calc(key_new, non_cached, m, out_new, stream);
- *
- *     // Store the calculated vectors into the cache.
- *     cache.StoreVecs(out_new, non_cached, non_cached, cache_idx_new, stream);
- *    }
- * }
- * @endcode
- */
-template <typename math_t, int associativity = 32>
-class Cache {
- public:
-  /**
-   * @brief Construct a Cache object
-   *
-   * @tparam math_t type of elements to be cached
-   * @tparam associativity number of vectors in a cache set
-   *
-   * @param stream cuda stream
-   * @param n_vec number of elements in a single vector that is stored in a
-   *   cache entry
-   * @param cache_size in MiB
-   */
-  Cache(cudaStream_t stream, int n_vec, float cache_size = 200)
-    : n_vec(n_vec),
-      cache_size(cache_size),
-      cache(0, stream),
-      cached_keys(0, stream),
-      cache_time(0, stream),
-      is_cached(0, stream),
-      ws_tmp(0, stream),
-      idx_tmp(0, stream),
-      d_num_selected_out(stream),
-      d_temp_storage(0, stream)
-  {
-    ASSERT(n_vec > 0, "Parameter n_vec: shall be larger than zero");
-    ASSERT(associativity > 0, "Associativity shall be larger than zero");
-    ASSERT(cache_size >= 0, "Cache size should not be negative");
-
-    // Calculate how many vectors would fit the cache
-    int n_cache_vecs = (cache_size * 1024 * 1024) / (sizeof(math_t) * n_vec);
-
-    // The available memory shall be enough for at least one cache set
-    if (n_cache_vecs >= associativity) {
-      n_cache_sets = n_cache_vecs / associativity;
-      n_cache_vecs = n_cache_sets * associativity;
-      cache.resize(n_cache_vecs * n_vec, stream);
-      cached_keys.resize(n_cache_vecs, stream);
-      cache_time.resize(n_cache_vecs, stream);
-      RAFT_CUDA_TRY(
-        cudaMemsetAsync(cached_keys.data(), 0, cached_keys.size() * sizeof(int), stream));
-      RAFT_CUDA_TRY(cudaMemsetAsync(cache_time.data(), 0, cache_time.size() * sizeof(int), stream));
-    } else {
-      if (cache_size > 0) {
-        CUML_LOG_WARN(
-          "Warning: not enough memory to cache a single set of "
-          "rows, not using cache");
-      }
-      n_cache_sets = 0;
-      cache_size   = 0;
-    }
-    CUML_LOG_DEBUG(
-      "Creating cache with size=%f MiB, to store %d vectors, in "
-      "%d sets with associativity=%d",
-      cache_size,
-      n_cache_vecs,
-      n_cache_sets,
-      associativity);
-  }
-
-  Cache(const Cache& other) = delete;
-
-  Cache& operator=(const Cache& other) = delete;
-
-  /** @brief Collect cached data into contiguous memory space.
-   *
-   * On exit, the tile array is filled the following way:
-   * out[i + n_vec*k] = cache[i + n_vec * idx[k]]), where i=0..n_vec-1,
-   * k = 0..n-1
-   *
-   * Idx values less than 0 are ignored.
-   *
-   * @param [in] idx cache indices, size [n]
-   * @param [in] n the number of vectors that need to be collected
-   * @param [out] out vectors collected from cache, size [n_vec*n]
-   * @param [in] stream cuda stream
-   */
-  void GetVecs(const int* idx, int n, math_t* out, cudaStream_t stream)
-  {
-    if (n > 0) {
-      get_vecs<<<raft::ceildiv(n * n_vec, TPB), TPB, 0, stream>>>(cache.data(), n_vec, idx, n, out);
-      RAFT_CUDA_TRY(cudaPeekAtLastError());
-    }
-  }
-
-  /** @brief Store vectors of data into the cache.
-   *
-   * Roughly the opposite of GetVecs, but the input vectors can be scattered
-   * in memory. The cache is updated using the following formula:
-   *
-   * cache[i + cache_idx[k]*n_vec] = tile[i + tile_idx[k]*n_vec],
-   * for i=0..n_vec-1, k=0..n-1
-   *
-   * If tile_idx==nullptr, then we assume tile_idx[k] = k.
-   *
-   * Elements within a vector should be contiguous in memory (i.e. column vectors
-   * for column major data storage, or row vectors of row major data).
-   *
-   * @param [in] tile stores the data to be cashed cached, size [n_vec x n_tile]
-   * @param [in] n_tile number of vectors in tile (at least n)
-   * @param [in] n number of vectors that need to be stored in the cache (a subset
-   *   of all the vectors in the tile)
-   * @param [in] cache_idx cache indices for storing the vectors (negative values
-   *   are ignored), size [n]
-   * @param [in] stream cuda stream
-   * @param [in] tile_idx indices of vectors that need to be stored
-   */
-  void StoreVecs(const math_t* tile,
-                 int n_tile,
-                 int n,
-                 int* cache_idx,
-                 cudaStream_t stream,
-                 const int* tile_idx = nullptr)
-  {
-    if (n > 0) {
-      store_vecs<<<raft::ceildiv(n * n_vec, TPB), TPB, 0, stream>>>(
-        tile, n_tile, n_vec, tile_idx, n, cache_idx, cache.data(), cache.size() / n_vec);
-      RAFT_CUDA_TRY(cudaPeekAtLastError());
-    }
-  }
-
-  /** @brief Map a set of keys to cache indices.
-   *
-   * For each k in 0..n-1, if keys[k] is found in the cache, then cache_idx[k]
-   * will tell the corresponding cache idx, and is_cached[k] is set to true.
-   *
-   * If keys[k] is not found in the cache, then is_cached[k] is set to false.
-   * In this case we assign the cache set for keys[k], and cache_idx[k] will
-   * store the cache set.
-   *
-   * @note in order to retrieve the cached vector j=cache_idx[k] from the cache,
-   *  we have to access cache[i + j*n_vec], where i=0..n_vec-1.
-   *
-   * @note: do not use simultaneous GetCacheIdx and AssignCacheIdx
-   *
-   * @param [in] keys device array of keys, size [n]
-   * @param [in] n number of keys
-   * @param [out] cache_idx device array of cache indices corresponding to the
-   *   input keys, size [n]
-   * @param [out] is_cached whether the element is already available in the
-   *   cache, size [n]
-   * @param [in] stream
-   */
-  void GetCacheIdx(int* keys, int n, int* cache_idx, bool* is_cached, cudaStream_t stream)
-  {
-    n_iter++;  // we increase the iteration counter, that is used to time stamp
-    // accessing entries from the cache
-    get_cache_idx<<<raft::ceildiv(n, TPB), TPB, 0, stream>>>(keys,
-                                                             n,
-                                                             cached_keys.data(),
-                                                             n_cache_sets,
-                                                             associativity,
-                                                             cache_time.data(),
-                                                             cache_idx,
-                                                             is_cached,
-                                                             n_iter);
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-  }
-
-  /** @brief Map a set of keys to cache indices.
-   *
-   * Same as GetCacheIdx, but partitions the keys, and cache_idx arrays in a way
-   * that keys[0..n_cached-1] and cache_idx[0..n_cached-1] store the indices of
-   * vectors that are found in the cache, while keys[n_cached..n-1] are the
-   * indices of vectors that are not found in the cache. For the vectors not
-   * found in the cache, cache_idx[n_cached..n-1] stores the cache set, and this
-   * can be used to call AssignCacheIdx.
-   *
-   * @param [inout] keys device array of keys, size [n]
-   * @param [in] n number of indices
-   * @param [out] cache_idx device array of cache indices corresponding to
-   *   the input keys, size [n]
-   * @param [out] n_cached number of elements that are cached
-   * @param [in] stream cuda stream
-   */
-  void GetCacheIdxPartitioned(int* keys, int n, int* cache_idx, int* n_cached, cudaStream_t stream)
-  {
-    ResizeTmpBuffers(n, stream);
-
-    GetCacheIdx(keys, n, ws_tmp.data(), is_cached.data(), stream);
-
-    // Group cache indices as [already cached, non_cached]
-    cub::DevicePartition::Flagged(d_temp_storage.data(),
-                                  d_temp_storage_size,
-                                  ws_tmp.data(),
-                                  is_cached.data(),
-                                  cache_idx,
-                                  d_num_selected_out.data(),
-                                  n,
-                                  stream);
-
-    raft::update_host(n_cached, d_num_selected_out.data(), 1, stream);
-
-    // Similarily re-group the input indices
-    raft::copy(ws_tmp.data(), keys, n, stream);
-    cub::DevicePartition::Flagged(d_temp_storage.data(),
-                                  d_temp_storage_size,
-                                  ws_tmp.data(),
-                                  is_cached.data(),
-                                  keys,
-                                  d_num_selected_out.data(),
-                                  n,
-                                  stream);
-
-    raft::interruptible::synchronize(stream);
-  }
-
-  /**
-   * @brief Assign cache location to a set of keys.
-   *
-   * Note: call GetCacheIdx first, to get the cache_set assigned to the keys.
-   * Keys that cannot be cached are assigned to -1.
-   *
-   * @param [inout] keys device array of keys, size [n]
-   * @param [in] n number of elements that we want to cache
-   * @param [inout] cidx on entry: cache_set, on exit: assigned cache_idx or -1,
-   *   size[n]
-   * @param [in] stream cuda stream
-   */
-  void AssignCacheIdx(int* keys, int n, int* cidx, cudaStream_t stream)
-  {
-    if (n <= 0) return;
-    cub::DeviceRadixSort::SortPairs(d_temp_storage.data(),
-                                    d_temp_storage_size,
-                                    cidx,
-                                    ws_tmp.data(),
-                                    keys,
-                                    idx_tmp.data(),
-                                    n,
-                                    0,
-                                    sizeof(int) * 8,
-                                    stream);
-
-    raft::copy(keys, idx_tmp.data(), n, stream);
-
-    // set it to -1
-    RAFT_CUDA_TRY(cudaMemsetAsync(cidx, 255, n * sizeof(int), stream));
-    const int nthreads = associativity <= 32 ? associativity : 32;
-
-    assign_cache_idx<nthreads, associativity><<<n_cache_sets, nthreads, 0, stream>>>(
-      keys, n, ws_tmp.data(), cached_keys.data(), n_cache_sets, cache_time.data(), n_iter, cidx);
-
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-    if (debug_mode) RAFT_CUDA_TRY(cudaDeviceSynchronize());
-  }
-
-  /** Return approximate cache size in MiB. */
-  float GetSizeInMiB() const { return cache_size; }
-
-  /**
-   * Returns the number of vectors that can be cached.
-   */
-  int GetSize() const { return cached_keys.size(); }
-
- private:
-  int n_vec;         //!< Number of elements in a cached vector
-  float cache_size;  //!< in MiB
-  int n_cache_sets;  //!< number of cache sets
-
-  const int TPB = 256;  //!< threads per block for kernel launch
-  int n_iter    = 0;    //!< Counter for time stamping cache operation
-
-  bool debug_mode = false;
-
-  rmm::device_uvector<math_t> cache;     //!< The value of cached vectors
-  rmm::device_uvector<int> cached_keys;  //!< Keys stored at each cache loc
-  rmm::device_uvector<int> cache_time;   //!< Time stamp for LRU cache
-
-  // Helper arrays for GetCacheIdx
-  rmm::device_uvector<bool> is_cached;
-  rmm::device_uvector<int> ws_tmp;
-  rmm::device_uvector<int> idx_tmp;
-
-  // Helper arrays for cub
-  rmm::device_scalar<int> d_num_selected_out;
-  rmm::device_uvector<char> d_temp_storage;
-  size_t d_temp_storage_size = 0;
-
-  void ResizeTmpBuffers(int n, cudaStream_t stream)
-  {
-    if (ws_tmp.size() < static_cast<std::size_t>(n)) {
-      ws_tmp.resize(n, stream);
-      is_cached.resize(n, stream);
-      idx_tmp.resize(n, stream);
-      cub::DevicePartition::Flagged(NULL,
-                                    d_temp_storage_size,
-                                    cached_keys.data(),
-                                    is_cached.data(),
-                                    cached_keys.data(),
-                                    d_num_selected_out.data(),
-                                    n,
-                                    stream);
-      d_temp_storage.resize(d_temp_storage_size, stream);
-    }
-  }
-};
-
-};  // namespace Cache
-};  // namespace MLCommon
diff --git a/cpp/src_prims/cache/cache_util.cuh b/cpp/src_prims/cache/cache_util.cuh
deleted file mode 100644
index 7c18f6f0d5..0000000000
--- a/cpp/src_prims/cache/cache_util.cuh
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <selection/kselection.cuh>
-
-#include <raft/cuda_utils.cuh>
-
-#include <cub/cub.cuh>
-
-namespace MLCommon {
-namespace Cache {
-
-/**
- * @brief Collect vectors of data from the cache into a contiguous memory buffer.
- *
- * We assume contiguous memory layout for the output buffer, i.e. we get
- * column vectors into a column major out buffer, or row vectors into a row
- * major output buffer.
- *
- * On exit, the output array is filled the following way:
- * out[i + n_vec*k] = cache[i + n_vec * cache_idx[k]]), where i=0..n_vec-1, and
- *   k = 0..n-1 where cache_idx[k] >= 0
- *
- *  We ignore vectors where cache_idx[k] < 0.
- *
- * @param [in] cache stores the cached data, size [n_vec x n_cached_vectors]
- * @param [in] n_vec number of elements in a cached vector
- * @param [in] cache_idx cache indices, size [n]
- * @param [in] n the number of elements that need to be collected
- * @param [out] out vectors collected from the cache, size [n_vec * n]
- */
-template <typename math_t>
-__global__ void get_vecs(const math_t* cache, int n_vec, const int* cache_idx, int n, math_t* out)
-{
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  int row = tid % n_vec;  // row idx
-  if (tid < n_vec * n) {
-    size_t out_col   = tid / n_vec;  // col idx
-    size_t cache_col = cache_idx[out_col];
-    if (cache_idx[out_col] >= 0) {
-      if (row + out_col * n_vec < (size_t)n_vec * n) { out[tid] = cache[row + cache_col * n_vec]; }
-    }
-  }
-}
-
-/**
- * @brief Store vectors of data into the cache.
- *
- * Elements within a vector should be contiguous in memory (i.e. column vectors
- * for column major data storage, or row vectors of row major data).
- *
- * If tile_idx==nullptr then the operation is the opposite of get_vecs,
- * i.e. we store
- * cache[i + cache_idx[k]*n_vec] = tile[i + k*n_vec], for i=0..n_vec-1, k=0..n-1
- *
- * If tile_idx != nullptr, then  we permute the vectors from tile according
- * to tile_idx. This allows to store vectors from a buffer where the individual
- * vectors are not stored contiguously (but the elements of each vector shall
- * be contiguous):
- * cache[i + cache_idx[k]*n_vec] = tile[i + tile_idx[k]*n_vec],
- * for i=0..n_vec-1, k=0..n-1
- *
- * @param [in] tile stores the data to be cashed cached, size [n_vec x n_tile]
- * @param [in] n_tile number of vectors in the input tile
- * @param [in] n_vec number of elements in a cached vector
- * @param [in] tile_idx indices of vectors that we want to store
- * @param [in] n number of vectos that we want to store (n <= n_tile)
- * @param [in] cache_idx cache indices, size [n], negative values are ignored
- * @param [inout] cache updated cache
- * @param [in] n_cache_vecs
- */
-template <typename math_t>
-__global__ void store_vecs(const math_t* tile,
-                           int n_tile,
-                           int n_vec,
-                           const int* tile_idx,
-                           int n,
-                           const int* cache_idx,
-                           math_t* cache,
-                           int n_cache_vecs)
-{
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  int row = tid % n_vec;  // row idx
-  if (tid < n_vec * n) {
-    int tile_col  = tid / n_vec;  // col idx
-    int data_col  = tile_idx ? tile_idx[tile_col] : tile_col;
-    int cache_col = cache_idx[tile_col];
-
-    // We ignore negative values. The rest of the checks should be fulfilled
-    // if the cache is used properly
-    if (cache_col >= 0 && cache_col < n_cache_vecs && data_col < n_tile) {
-      cache[row + (size_t)cache_col * n_vec] = tile[row + (size_t)data_col * n_vec];
-    }
-  }
-}
-
-/**
- * Map a key to a cache set.
- */
-int DI hash(int key, int n_cache_sets, int associativity) { return key % n_cache_sets; }
-
-/**
- * @brief Binary search to find the first element in the array which is greater
- * equal than a given value.
- * @param [in] array sorted array of n numbers
- * @param [in] n length of the array
- * @param [in] val the value to search for
- * @return the index of the first element in the array for which
- * array[idx] >= value. If there is no such value, then return n.
- */
-int DI arg_first_ge(const int* array, int n, int val)
-{
-  int start = 0;
-  int end   = n - 1;
-  if (array[0] == val) return 0;
-  if (array[end] < val) return n;
-  while (start + 1 < end) {
-    int q = (start + end + 1) / 2;
-    // invariants:
-    // start < end
-    // start < q <=end
-    // array[start] < val && array[end] <=val
-    // at every iteration d = end-start is decreasing
-    // when d==0, then array[end] will be the first element >= val.
-    if (array[q] >= val) {
-      end = q;
-    } else {
-      start = q;
-    }
-  }
-  return end;
-}
-/**
- * @brief Find the k-th occurrence of value in a sorted array.
- *
- * Assume that array is [0, 1, 1, 1, 2, 2, 4, 4, 4, 4, 6, 7]
- * then find_nth_occurrence(cset, 12, 4, 2) == 7, because cset_array[7] stores
- * the second element with value = 4.
- * If there are less then k values in the array, then return -1
- *
- * @param [in] array sorted array of numbers, size [n]
- * @param [in] n number of elements in the array
- * @param [in] val the value we are searching for
- * @param [in] k
- * @return the idx of the k-th occurance of val in array, or -1 if
- * the value is not found.
- */
-int DI find_nth_occurrence(const int* array, int n, int val, int k)
-{
-  int q = arg_first_ge(array, n, val);
-  if (q + k < n && array[q + k] == val) {
-    q += k;
-  } else {
-    q = -1;
-  }
-  return q;
-}
-
-/**
- * @brief Rank the entries in a cache set according the time stamp, return the
- * indices that would sort the time stamp in ascending order.
- *
- * Assume we have a single cache set with time stamps as:
- * key (threadIdx.x):   0   1   2   3
- * val (time stamp):    8   6   7   5
- *
- * The corresponding sorted key-value pairs:
- * key:    3   1   2   0
- * val:    5   6   7   8
- * rank: 0th 1st 2nd 3rd
- *
- * On return, the rank is assigned for each thread:
- * threadIdx.x: 0   1   2   3
- * rank:        3   1   2   0
- *
- * For multiple cache sets, launch one block per cache set.
- *
- * @tparam nthreads number of threads per block (nthreads <= associativity)
- * @tparam associativity number of items in a cache set
- *
- * @param [in] cache_time time stamp of caching the data,
-     size [associativity * n_cache_sets]
- * @param [in] n_cache_sets number of cache sets
- * @param [out] rank within the cache set size [nthreads * items_per_thread]
- *   Each block should give a different pointer for rank.
- */
-template <int nthreads, int associativity>
-DI void rank_set_entries(const int* cache_time, int n_cache_sets, int* rank)
-{
-  const int items_per_thread = raft::ceildiv(associativity, nthreads);
-  typedef cub::BlockRadixSort<int, nthreads, items_per_thread, int> BlockRadixSort;
-  __shared__ typename BlockRadixSort::TempStorage temp_storage;
-
-  int key[items_per_thread];
-  int val[items_per_thread];
-
-  int block_offset = blockIdx.x * associativity;
-
-  for (int j = 0; j < items_per_thread; j++) {
-    int k  = threadIdx.x + j * nthreads;
-    int t  = (k < associativity) ? cache_time[block_offset + k] : 32768;
-    key[j] = t;
-    val[j] = k;
-  }
-
-  BlockRadixSort(temp_storage).Sort(key, val);
-
-  for (int j = 0; j < items_per_thread; j++) {
-    if (val[j] < associativity) { rank[val[j]] = threadIdx.x * items_per_thread + j; }
-  }
-  __syncthreads();
-}
-
-/**
- * @brief Assign cache location to a set of keys using LRU replacement policy.
- *
- * The keys and the corresponding cache_set arrays shall be sorted according
- * to cache_set in ascending order. One block should be launched for every cache
- * set.
- *
- * Each cache set is sorted according to time_stamp, and values from keys
- * are filled in starting at the oldest time stamp. Enties that were accessed
- * at the current time are not reassigned.
- *
- * @tparam nthreads number of threads per block
- * @tparam assaciativity number of keys in a cache set
- *
- * @param [in] keys that we want to cache size [n]
- * @param [in] n number of keys
- * @param [in] cache_set assigned to keys, size [n]
- * @param [inout] cached_keys keys of already cached vectors,
- *   size [n_cache_sets*associativity], on exit it will be updated with the
- *   cached elements from keys.
- * @param [in] n_cache_sets number of cache sets
- * @param [inout] cache_time will be updated to "time" for those elements that
- *   could be assigned to a cache location, size [n_cache_sets*associativity]
- * @param [in] time time stamp
- * @param [out] cache_idx the cache idx assigned to the input, or -1 if it could
- *   not be cached, size [n]
- */
-template <int nthreads, int associativity>
-__global__ void assign_cache_idx(const int* keys,
-                                 int n,
-                                 const int* cache_set,
-                                 int* cached_keys,
-                                 int n_cache_sets,
-                                 int* cache_time,
-                                 int time,
-                                 int* cache_idx)
-{
-  int block_offset = blockIdx.x * associativity;
-
-  const int items_per_thread = raft::ceildiv(associativity, nthreads);
-
-  // the size of rank limits how large associativity can be used in practice
-  __shared__ int rank[items_per_thread * nthreads];
-  rank_set_entries<nthreads, associativity>(cache_time, n_cache_sets, rank);
-
-  // Each thread will fill items_per_thread items in the cache.
-  // It uses a place, only if it was not udated at the current time step
-  // (cache_time != time).
-  // We rank the places acconding to the time stamp, least recently used
-  // elements come to the front.
-  // We fill the least recently used elements with the working set.
-  // there might be elements which cannot be assigned to cache loc.
-  // these elements are assigned -1.
-
-  for (int j = 0; j < items_per_thread; j++) {
-    int i     = threadIdx.x + j * nthreads;
-    int t_idx = block_offset + i;
-    bool mask = (i < associativity);
-    // whether this slot is available for writing
-    mask = mask && (cache_time[t_idx] != time);
-
-    // rank[i] tells which element to store by this thread
-    // we look up where is the corresponding key stored in the input array
-    if (mask) {
-      int k = find_nth_occurrence(cache_set, n, blockIdx.x, rank[i]);
-      if (k > -1) {
-        int key_val        = keys[k];
-        cached_keys[t_idx] = key_val;
-        cache_idx[k]       = t_idx;
-        cache_time[t_idx]  = time;
-      }
-    }
-  }
-}
-
-/* Unnamed namespace is used to avoid multiple definition error for the
-  following non-template function */
-namespace {
-/**
- * @brief Get the cache indices for keys stored in the cache.
- *
- * For every key, we look up the corresponding cache position.
- * If keys[k] is stored in the cache, then is_cached[k] is set to true, and
- * cache_idx[k] stores the corresponding cache idx.
- *
- * If keys[k] is not stored in the cache, then we assign a cache set to it.
- * This  cache set is stored in cache_idx[k], and is_cached[k] is set to false.
- * In this case AssignCacheIdx should be called, to get an assigned position
- * within the cache set.
- *
- * Cache_time is assigned to the time input argument for all elements in idx.
- *
- * @param [in] keys array of keys that we want to look up in the cache, size [n]
- * @param [in] n number of keys to look up
- * @param [inout] cached_keys keys stored in the cache, size [n_cache_sets * associativity]
- * @param [in] n_cache_sets number of cache sets
- * @param [in] associativity number of keys in cache set
- * @param [inout] cache_time time stamp when the indices were cached, size [n_cache_sets *
- * associativity]
- * @param [out] cache_idx cache indices of the working set elements, size [n]
- * @param [out] is_cached whether the element is cached size[n]
- * @param [in] time iteration counter (used for time stamping)
- */
-__global__ void get_cache_idx(int* keys,
-                              int n,
-                              int* cached_keys,
-                              int n_cache_sets,
-                              int associativity,
-                              int* cache_time,
-                              int* cache_idx,
-                              bool* is_cached,
-                              int time)
-{
-  int tid = threadIdx.x + blockIdx.x * blockDim.x;
-  if (tid < n) {
-    int widx   = keys[tid];
-    int sidx   = hash(widx, n_cache_sets, associativity);
-    int cidx   = sidx * associativity;
-    int i      = 0;
-    bool found = false;
-    // search for empty spot and the least recently used spot
-    while (i < associativity && !found) {
-      found = (cache_time[cidx + i] > 0 && cached_keys[cidx + i] == widx);
-      i++;
-    }
-    is_cached[tid] = found;
-    if (found) {
-      cidx             = cidx + i - 1;
-      cache_time[cidx] = time;  // update time stamp
-      cache_idx[tid]   = cidx;  // exact cache idx
-    } else {
-      cache_idx[tid] = sidx;  // assign cache set
-    }
-  }
-}
-};  // end unnamed namespace
-};  // end namespace Cache
-};  // end namespace MLCommon
diff --git a/cpp/src_prims/label/classlabels.cuh b/cpp/src_prims/label/classlabels.cuh
deleted file mode 100644
index ecb4e55871..0000000000
--- a/cpp/src_prims/label/classlabels.cuh
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cub/cub.cuh>
-
-#include <raft/core/cudart_utils.hpp>
-#include <raft/core/handle.hpp>
-#include <raft/cuda_utils.cuh>
-#include <raft/label/classlabels.cuh>
-#include <raft/linalg/unary_op.cuh>
-#include <rmm/device_scalar.hpp>
-#include <rmm/device_uvector.hpp>
-
-namespace MLCommon {
-namespace Label {
-
-using namespace MLCommon;
-
-/**
- * Get uniuque class labels.
- *
- * The y array is assumed to store class labels. The unique values are selected
- * from this array.
- *
- * \tparam math_t numeric type of the arrays with class labels
- * \param [in] y device array of labels, size [n]
- * \param [in] n number of labels
- * \param [out] unique device array of unique labels, needs to be pre-allocated
- * \param [in] stream cuda stream
- */
-template <typename math_t>
-int getUniqueLabels(math_t* y, size_t n, math_t* unique, cudaStream_t stream)
-{
-  rmm::device_uvector<math_t> unique_v(0, stream);
-  auto n_unique = raft::label::getUniquelabels(unique_v, y, n, stream);
-  raft::copy(unique, unique_v.data(), n_unique, stream);
-  raft::interruptible::synchronize(stream);
-  return n_unique;
-}
-
-/**
- * Assign one versus rest labels.
- *
- * The output labels will have values +/-1:
- * y_out = (y == y_unique[idx]) ? +1 : -1;
- *
- * The output type currently is set to math_t, but for SVM in principle we are
- * free to choose other type for y_out (it should represent +/-1, and it is used
- * in floating point arithmetics).
- *
- * \param [in] y device array if input labels, size [n]
- * \param [in] n number of labels
- * \param [in] y_unique device array of unique labels, size [n_classes]
- * \param [in] n_classes number of unique labels
- * \param [out] y_out device array of output labels
- * \param [in] idx index of unique label that should be labeled as 1
- * \param [in] stream cuda stream
- */
-template <typename math_t>
-void getOvrLabels(
-  math_t* y, int n, math_t* y_unique, int n_classes, math_t* y_out, int idx, cudaStream_t stream)
-{
-  ASSERT(idx < n_classes,
-         "Parameter idx should not be larger than the number "
-         "of classes");
-  raft::linalg::unaryOp(
-    y_out,
-    y,
-    n,
-    [idx, y_unique] __device__(math_t y) { return y == y_unique[idx] ? +1 : -1; },
-    stream);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-}
-
-// TODO: add one-versus-one selection: select two classes, relabel them to
-// +/-1, return array with the new class labels and corresponding indices.
-
-template <typename Type, int TPB_X, typename Lambda>
-__global__ void map_label_kernel(
-  Type* map_ids, size_t N_labels, Type* in, Type* out, size_t N, Lambda filter_op)
-{
-  int tid = threadIdx.x + blockIdx.x * TPB_X;
-  if (tid < N) {
-    if (!filter_op(in[tid])) {
-      for (size_t i = 0; i < N_labels; i++) {
-        if (in[tid] == map_ids[i]) {
-          out[tid] = i + 1;
-          break;
-        }
-      }
-    }
-  }
-}
-
-/**
- * Maps an input array containing a series of numbers into a new array
- * where numbers have been mapped to a monotonically increasing set
- * of labels. This can be useful in machine learning algorithms, for instance,
- * where a given set of labels is not taken from a monotonically increasing
- * set. This can happen if they are filtered or if only a subset of the
- * total labels are used in a dataset. This is also useful in graph algorithms
- * where a set of vertices need to be labeled in a monotonically increasing
- * order.
- * @tparam Type the numeric type of the input and output arrays
- * @tparam Lambda the type of an optional filter function, which determines
- * which items in the array to map.
- * @param out the output monotonic array
- * @param in input label array
- * @param N number of elements in the input array
- * @param stream cuda stream to use
- * @param filter_op an optional function for specifying which values
- * should have monotonically increasing labels applied to them.
- */
-template <typename Type, typename Lambda>
-int make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream, Lambda filter_op)
-{
-  static const size_t TPB_X = 256;
-  dim3 blocks(raft::ceildiv(N, TPB_X));
-  dim3 threads(TPB_X);
-
-  rmm::device_uvector<Type> unique(0, stream);
-  int n_unique = raft::label::getUniquelabels(unique, in, N, stream);
-  unique.resize(n_unique, stream);
-
-  map_label_kernel<Type, TPB_X>
-    <<<blocks, threads, 0, stream>>>(unique.data(), n_unique, in, out, N, filter_op);
-
-  return n_unique;
-}
-
-/**
- * Maps an input array containing a series of numbers into a new array
- * where numbers have been mapped to a monotonically increasing set
- * of labels. This can be useful in machine learning algorithms, for instance,
- * where a given set of labels is not taken from a monotonically increasing
- * set. This can happen if they are filtered or if only a subset of the
- * total labels are used in a dataset. This is also useful in graph algorithms
- * where a set of vertices need to be labeled in a monotonically increasing
- * order.
- * @tparam Type the numeric type of the input and output arrays
- * @tparam Lambda the type of an optional filter function, which determines
- * which items in the array to map.
- * @param out output label array with labels assigned monotonically
- * @param in input label array
- * @param N number of elements in the input array
- * @param stream cuda stream to use
- */
-template <typename Type>
-void make_monotonic(Type* out, Type* in, size_t N, cudaStream_t stream)
-{
-  make_monotonic<Type>(out, in, N, stream, [] __device__(Type val) { return false; });
-}
-
-template <typename Type>
-int make_monotonic(const raft::handle_t& handle, Type* out, Type* in, size_t N)
-{
-  return make_monotonic<Type>(
-    out, in, N, handle.get_stream(), [] __device__(Type val) { return false; });
-}
-};  // namespace Label
-};  // end namespace MLCommon
diff --git a/cpp/src_prims/label/merge_labels.cuh b/cpp/src_prims/label/merge_labels.cuh
deleted file mode 100644
index b3c25ef6b0..0000000000
--- a/cpp/src_prims/label/merge_labels.cuh
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <limits>
-#include <math.h>
-
-#include <raft/core/cudart_utils.hpp>
-#include <raft/core/interruptible.hpp>
-#include <raft/cuda_utils.cuh>
-#include <raft/linalg/init.cuh>
-
-namespace MLCommon {
-namespace Label {
-
-/** Note: this is one possible implementation where we represent the label
- *  equivalence graph implicitly using labels_a, labels_b and mask.
- *  For an additional cost we can build the graph with edges
- *  E={(A[i], B[i]) | M[i]=1} and make this step faster */
-template <typename Index_, int TPB_X = 256>
-__global__ void __launch_bounds__(TPB_X) propagate_label_kernel(const Index_* __restrict__ labels_a,
-                                                                const Index_* __restrict__ labels_b,
-                                                                Index_* __restrict__ R,
-                                                                const bool* __restrict__ mask,
-                                                                bool* __restrict__ m,
-                                                                Index_ N)
-{
-  Index_ tid = threadIdx.x + blockIdx.x * TPB_X;
-  if (tid < N) {
-    if (__ldg((char*)mask + tid)) {
-      // Note: labels are from 1 to N
-      Index_ la = __ldg(labels_a + tid) - 1;
-      Index_ lb = __ldg(labels_b + tid) - 1;
-      Index_ ra = R[la];
-      Index_ rb = R[lb];
-      if (ra != rb) {
-        *m = true;
-        // min(ra, rb) would be sufficient but this speeds up convergence
-        Index_ rmin = R[min(ra, rb)];
-        if (sizeof(Index_) == 4) {
-          atomicMin((int*)(R + la), rmin);
-          atomicMin((int*)(R + lb), rmin);
-        } else if (sizeof(Index_) == 8) {
-          atomicMin((long long int*)(R + la), rmin);
-          atomicMin((long long int*)(R + lb), rmin);
-        }
-      }
-    }
-  }
-}
-
-template <typename Index_, int TPB_X = 256>
-__global__ void __launch_bounds__(TPB_X) reassign_label_kernel(Index_* __restrict__ labels_a,
-                                                               const Index_* __restrict__ labels_b,
-                                                               const Index_* __restrict__ R,
-                                                               Index_ N,
-                                                               Index_ MAX_LABEL)
-{
-  Index_ tid = threadIdx.x + blockIdx.x * TPB_X;
-  if (tid < N) {
-    // Note: labels are from 1 to N
-    Index_ la     = labels_a[tid];
-    Index_ lb     = __ldg(labels_b + tid);
-    Index_ ra     = (la == MAX_LABEL) ? MAX_LABEL : __ldg(R + (la - 1)) + 1;
-    Index_ rb     = (lb == MAX_LABEL) ? MAX_LABEL : __ldg(R + (lb - 1)) + 1;
-    labels_a[tid] = min(ra, rb);
-  }
-}
-
-/**
- * @brief Merge two labellings in-place, according to a core mask
- *
- * A labelling is a representation of disjoint sets (groups) where points that
- * belong to the same group have the same label. It is assumed that group
- * labels take values between 1 and N. Labels relate to points, i.e a label i+1
- * means that you belong to the same group as the point i.
- * The special value MAX_LABEL is used to mark points that are not labelled.
- *
- * The two label arrays A and B induce two sets of groups over points 0..N-1.
- * If a point is labelled i in A and j in B and the mask is true for this
- * point, then i and j are equivalent labels and their groups are merged by
- * relabeling the elements of both groups to have the same label. The new label
- * is the smaller one from the original labels.
- * It is required that if the mask is true for a point, this point is labelled
- * (i.e its label is different than the special value MAX_LABEL).
- *
- * One use case is finding connected components: the two input label arrays can
- * represent the connected components of graphs G_A and G_B, and the output
- * would be the connected components labels of G_A \union G_B.
- *
- * @param[inout] labels_a    First input, and output label array (in-place)
- * @param[in]    labels_b    Second input label array
- * @param[in]    mask        Core point mask
- * @param[out]   R           Label equivalence map
- * @param[in]    m           Working flag
- * @param[in]    N           Number of points in the dataset
- * @param[in]    stream      CUDA stream
- */
-template <typename Index_ = int, int TPB_X = 256>
-void merge_labels(Index_* labels_a,
-                  const Index_* labels_b,
-                  const bool* mask,
-                  Index_* R,
-                  bool* m,
-                  Index_ N,
-                  cudaStream_t stream)
-{
-  dim3 blocks(raft::ceildiv(N, Index_(TPB_X)));
-  dim3 threads(TPB_X);
-  Index_ MAX_LABEL = std::numeric_limits<Index_>::max();
-
-  // Initialize R. R defines the relabeling rules; after merging the input
-  // arrays, label l will be reassigned as R[l-1]+1.
-  raft::linalg::range(R, N, stream);
-
-  // We define the label equivalence graph: G = (V, E), where:
-  //  - V is the set of unique values from labels_a and labels_b
-  //  - E = {(labels_a[k], labels_b[k]) | mask[k] == true and k \in 0..n-1 }
-  // The edges connect groups from the two labellings. Only points with true
-  // mask can induce connection between groups.
-
-  // Step 1: compute connected components in the label equivalence graph
-  bool host_m;
-  do {
-    RAFT_CUDA_TRY(cudaMemsetAsync(m, false, sizeof(bool), stream));
-
-    propagate_label_kernel<Index_, TPB_X>
-      <<<blocks, threads, 0, stream>>>(labels_a, labels_b, R, mask, m, N);
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-    raft::update_host(&host_m, m, 1, stream);
-    raft::interruptible::synchronize(stream);
-  } while (host_m);
-
-  // Step 2: re-assign minimum equivalent label
-  reassign_label_kernel<Index_, TPB_X>
-    <<<blocks, threads, 0, stream>>>(labels_a, labels_b, R, N, MAX_LABEL);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-}
-
-};  // namespace Label
-};  // namespace MLCommon
diff --git a/cpp/src_prims/selection/knn.cuh b/cpp/src_prims/selection/knn.cuh
index bf082d1ecc..cd19e20d91 100644
--- a/cpp/src_prims/selection/knn.cuh
+++ b/cpp/src_prims/selection/knn.cuh
@@ -16,14 +16,14 @@
 
 #pragma once
 
-#include <label/classlabels.cuh>
+#include <raft/label/classlabels.cuh>
 
 #include <cuml/neighbors/knn.hpp>
 
-#include <raft/core/cudart_utils.hpp>
-#include <raft/cuda_utils.cuh>
 #include <raft/distance/distance.cuh>
 #include <raft/distance/distance_type.hpp>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
 
 #include <faiss/gpu/GpuDistance.h>
 #include <faiss/gpu/GpuIndexFlat.h>
@@ -200,7 +200,7 @@ void class_probs(const raft::handle_t& handle,
     raft::update_device(y_tmp.data(), y[i], n_index_rows, stream);
     raft::update_device(y_tmp.data() + n_index_rows, uniq_labels[i], n_unique_labels, stream);
 
-    MLCommon::Label::make_monotonic(y_normalized.data(), y_tmp.data(), y_tmp.size(), stream);
+    raft::label::make_monotonic(y_normalized.data(), y_tmp.data(), y_tmp.size(), stream);
     raft::linalg::unaryOp<int>(
       y_normalized.data(),
       y_normalized.data(),
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 4082442c35..79f495ffb0 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -244,13 +244,11 @@ if(BUILD_PRIMS_TESTS)
   ConfigureTest(PREFIX PRIMS NAME KNN_CLASSIFY_TEST PATH prims/knn_classify.cu)
   ConfigureTest(PREFIX PRIMS NAME KNN_REGRESSION_TEST PATH prims/knn_regression.cu)
   ConfigureTest(PREFIX PRIMS NAME KSELECTION_TEST PATH prims/kselection.cu)
-  ConfigureTest(PREFIX PRIMS NAME LABEL_TEST PATH prims/label.cu)
   ConfigureTest(PREFIX PRIMS NAME LINALG_BLOCK_TEST PATH prims/linalg_block.cu)
   ConfigureTest(PREFIX PRIMS NAME LINEARREG_TEST PATH prims/linearReg.cu)
   ConfigureTest(PREFIX PRIMS NAME LOG_TEST PATH prims/log.cu)
   ConfigureTest(PREFIX PRIMS NAME LOGISTICREG_TEST PATH prims/logisticReg.cu)
   ConfigureTest(PREFIX PRIMS NAME MAKE_ARIMA_TEST PATH prims/make_arima.cu)
-  ConfigureTest(PREFIX PRIMS NAME MERGE_LABELS_TEST PATH prims/merge_labels.cu)
   ConfigureTest(PREFIX PRIMS NAME PENALTY_TEST PATH prims/penalty.cu)
   ConfigureTest(PREFIX PRIMS NAME REVERSE_TEST PATH prims/reverse.cu)
   ConfigureTest(PREFIX PRIMS NAME SIGMOID_TEST PATH prims/sigmoid.cu)
diff --git a/cpp/test/prims/cache.cu b/cpp/test/prims/cache.cu
deleted file mode 100644
index 31b1603c73..0000000000
--- a/cpp/test/prims/cache.cu
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "test_utils.h"
-#include <cache/cache.cuh>
-#include <gtest/gtest.h>
-#include <iostream>
-#include <raft/core/cudart_utils.hpp>
-#include <raft/core/interruptible.hpp>
-#include <raft/cuda_utils.cuh>
-#include <rmm/device_uvector.hpp>
-
-namespace MLCommon {
-namespace Cache {
-
-class CacheTest : public ::testing::Test {
- protected:
-  CacheTest()
-    : x_dev(0, stream),
-      tile_dev(0, stream),
-      keys_dev(0, stream),
-      is_cached(0, stream),
-      cache_idx_dev(0, stream),
-      zeroone_dev(0, stream),
-      int_array_dev(0, stream),
-      argfirst_dev(0, stream)
-  {
-  }
-
-  void SetUp() override
-  {
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-    x_dev.resize(n_rows * n_cols, stream);
-    raft::update_device(x_dev.data(), x_host, n_rows * n_cols, stream);
-    tile_dev.resize(n_rows * n_cols, stream);
-
-    keys_dev.resize(n, stream);
-    is_cached.resize(n, stream);
-    cache_idx_dev.resize(n, stream);
-    raft::update_device(keys_dev.data(), keys_host, n, stream);
-    zeroone_dev.resize(n, stream);
-    int_array_dev.resize(12, stream);
-    raft::update_device(zeroone_dev.data(), zeroone_host, n, stream);
-    argfirst_dev.resize(n_rows, stream);
-  }
-
-  int n_rows = 10;
-  int n_cols = 2;
-  int n      = 10;
-
-  rmm::device_uvector<float> x_dev;
-  rmm::device_uvector<int> keys_dev;
-  rmm::device_uvector<int> cache_idx_dev;
-  rmm::device_uvector<int> int_array_dev;
-  float x_host[20] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20};
-
-  rmm::device_uvector<float> tile_dev;
-
-  int keys_host[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-
-  int zeroone_host[10] = {0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
-  rmm::device_uvector<int> zeroone_dev;
-
-  rmm::device_uvector<int> argfirst_dev;
-
-  cudaStream_t stream = 0;
-
-  rmm::device_uvector<bool> is_cached;
-};
-
-__global__ void test_argfirst(const int* array, int n, int* res)
-{
-  int k  = threadIdx.x;
-  res[k] = arg_first_ge(array, n, k);
-}
-
-TEST_F(CacheTest, TestArgFirst)
-{
-  int argfirst_host[10] = {0, 1, 1, 1, 2, 2, 4, 4, 6, 7};
-  raft::update_device(argfirst_dev.data(), argfirst_host, 10, stream);
-
-  test_argfirst<<<1, 10>>>(argfirst_dev.data(), 10, int_array_dev.data());
-  int idx_exp[10] = {0, 1, 4, 6, 6, 8, 8, 9, 10, 10};
-  EXPECT_TRUE(devArrMatchHost(idx_exp, int_array_dev.data(), 10, raft::Compare<int>()));
-}
-
-__global__ void test_nth_occurrence(const int* array, int n, int val, int* res)
-{
-  int k  = threadIdx.x;
-  res[k] = find_nth_occurrence(array, n, val, k);
-}
-
-TEST_F(CacheTest, TestNthOccurrence)
-{
-  test_nth_occurrence<<<1, 10>>>(zeroone_dev.data(), 10, 0, int_array_dev.data());
-  int idx_exp[10] = {0, 1, 2, 3, 4, -1, -1, -1, -1, -1};
-  EXPECT_TRUE(devArrMatchHost(idx_exp, int_array_dev.data(), 10, raft::Compare<int>()));
-  test_nth_occurrence<<<1, 10>>>(zeroone_dev.data(), 10, 1, int_array_dev.data());
-  int idx_exp2[10] = {5, 6, 7, 8, 9, -1, -1, -1, -1, -1};
-  EXPECT_TRUE(devArrMatchHost(idx_exp2, int_array_dev.data(), 10, raft::Compare<int>()));
-}
-
-template <int nthreads, int associativity>
-__global__ void test_rank_set_entries(const int* array, int n, int* res)
-{
-  const int items_per_thread = raft::ceildiv(associativity, nthreads);
-  __shared__ int rank[items_per_thread * nthreads];
-
-  rank_set_entries<nthreads, associativity>(array, n, rank);
-
-  int block_offset = blockIdx.x * associativity;
-
-  for (int i = 0; i < items_per_thread; i++) {
-    int k = threadIdx.x * items_per_thread + i;
-    if (k < associativity && block_offset + k < n) res[block_offset + k] = rank[k];
-  }
-}
-
-TEST_F(CacheTest, TestRankEntries)
-{
-  // Three cache sets, with 4 elements each
-  int val[12] = {12, 11, 10, 9, 8, 6, 7, 5, 4, 1, 2, 3};
-  raft::update_device(int_array_dev.data(), val, 12, stream);
-
-  const int nthreads = 4;
-  test_rank_set_entries<nthreads, 4>
-    <<<3, nthreads>>>(int_array_dev.data(), 12, int_array_dev.data());
-
-  // expect that each block is sorted separately
-  // the indices that sorts the block are the following
-  int idx_exp[12] = {3, 2, 1, 0, 3, 1, 2, 0, 3, 0, 1, 2};
-
-  EXPECT_TRUE(devArrMatchHost(idx_exp, int_array_dev.data(), 12, raft::Compare<int>()));
-
-  // do the same with less than 4 threads
-  const int nthreads3 = 3;
-  raft::update_device(int_array_dev.data(), val, 12, stream);
-  test_rank_set_entries<nthreads3, 4>
-    <<<3, nthreads3>>>(int_array_dev.data(), 12, int_array_dev.data());
-  EXPECT_TRUE(devArrMatchHost(idx_exp, int_array_dev.data(), 12, raft::Compare<int>()));
-}
-
-TEST_F(CacheTest, TestSimple)
-{
-  float cache_size = 5 * sizeof(float) * n_cols / (1024 * 1024.0);
-  Cache<float, 2> cache(stream, n_cols, cache_size);
-
-  ASSERT_EQ(cache.GetSize(), 4);
-
-  cache.GetCacheIdx(keys_dev.data(), n, cache_idx_dev.data(), is_cached.data(), stream);
-  EXPECT_TRUE(devArrMatch(false, is_cached.data(), n, raft::Compare<bool>()));
-
-  int cache_set[10] = {0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
-  EXPECT_TRUE(devArrMatchHost(cache_set, cache_idx_dev.data(), n, raft::Compare<int>()));
-  int n_cached = 1;
-  cache.GetCacheIdxPartitioned(keys_dev.data(), n, cache_idx_dev.data(), &n_cached, stream);
-  EXPECT_EQ(n_cached, 0);
-}
-
-TEST_F(CacheTest, TestAssignCacheIdx)
-{
-  float cache_size = 5 * sizeof(float) * n_cols / (1024 * 1024.0);
-  Cache<float, 2> cache(stream, n_cols, cache_size);
-
-  ASSERT_EQ(cache.GetSize(), 4);
-
-  int n_cached;
-  cache.GetCacheIdxPartitioned(keys_dev.data(), n, cache_idx_dev.data(), &n_cached, stream);
-
-  cache.AssignCacheIdx(keys_dev.data(), n, cache_idx_dev.data(), stream);
-
-  int cache_idx_exp[10] = {0, 1, -1, -1, -1, 2, 3, -1, -1, -1};
-  int keys_exp[10]      = {8, 6, 4, 2, 0, 9, 7, 5, 3, 1};
-  EXPECT_TRUE(devArrMatchHost(cache_idx_exp, cache_idx_dev.data(), n, raft::Compare<int>()));
-  EXPECT_TRUE(devArrMatchHost(keys_exp, keys_dev.data(), n, raft::Compare<int>()));
-
-  // Now the elements that have been assigned a cache slot are considered cached
-  // A subsequent cache lookup should give us their cache indices.
-  raft::update_device(keys_dev.data(), keys_host, n, stream);
-  cache.GetCacheIdxPartitioned(keys_dev.data(), n, cache_idx_dev.data(), &n_cached, stream);
-  ASSERT_EQ(n_cached, 4);
-
-  int keys_exp2[4] = {6, 7, 8, 9};
-  EXPECT_TRUE(devArrMatchHost(keys_exp2, keys_dev.data(), n_cached, raft::Compare<int>()));
-  int cache_idx_exp2[4] = {1, 3, 0, 2};
-  EXPECT_TRUE(
-    devArrMatchHost(cache_idx_exp2, cache_idx_dev.data(), n_cached, raft::Compare<int>()));
-
-  // Find cache slots, when not available
-  int non_cached = n - n_cached;
-  cache.AssignCacheIdx(
-    keys_dev.data() + n_cached, non_cached, cache_idx_dev.data() + n_cached, stream);
-
-  int cache_idx_exp3[6] = {-1, -1, -1, -1, -1, -1};
-  EXPECT_TRUE(devArrMatchHost(
-    cache_idx_exp3, cache_idx_dev.data() + n_cached, non_cached, raft::Compare<int>()));
-}
-
-TEST_F(CacheTest, TestEvict)
-{
-  float cache_size = 8 * sizeof(float) * n_cols / (1024 * 1024.0);
-  Cache<float, 4> cache(stream, n_cols, cache_size);
-
-  ASSERT_EQ(cache.GetSize(), 8);
-
-  int n_cached;
-  cache.GetCacheIdxPartitioned(keys_dev.data(), 5, cache_idx_dev.data(), &n_cached, stream);
-  ASSERT_EQ(n_cached, 0);
-  cache.AssignCacheIdx(keys_dev.data(), 5, cache_idx_dev.data(), stream);
-
-  int cache_idx_exp[5] = {0, 1, 2, 4, 5};
-  int keys_exp[5]      = {4, 2, 0, 3, 1};
-  EXPECT_TRUE(devArrMatchHost(cache_idx_exp, cache_idx_dev.data(), 5, raft::Compare<int>()));
-  EXPECT_TRUE(devArrMatchHost(keys_exp, keys_dev.data(), 5, raft::Compare<int>()));
-
-  int idx_host[10] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  raft::update_device(keys_dev.data(), idx_host, 10, stream);
-  cache.GetCacheIdxPartitioned(keys_dev.data(), 10, cache_idx_dev.data(), &n_cached, stream);
-  EXPECT_EQ(n_cached, 3);
-  int cache_idx_exp2[3] = {1, 4, 0};
-  EXPECT_TRUE(devArrMatchHost(cache_idx_exp2, cache_idx_dev.data(), 3, raft::Compare<int>()));
-
-  cache.AssignCacheIdx(
-    keys_dev.data() + n_cached, 10 - n_cached, cache_idx_dev.data() + n_cached, stream);
-
-  int keys_exp3[10]      = {2, 3, 4, 10, 8, 6, 11, 9, 7, 5};
-  int cache_idx_exp3[10] = {1, 4, 0, 3, 2, -1, 6, 7, 5, -1};
-  EXPECT_TRUE(devArrMatchHost(keys_exp3, keys_dev.data(), 10, raft::Compare<int>()));
-  EXPECT_TRUE(devArrMatchHost(cache_idx_exp3, cache_idx_dev.data(), 10, raft::Compare<int>()));
-}
-
-TEST_F(CacheTest, TestStoreCollect)
-{
-  float cache_size = 8 * sizeof(float) * n_cols / (1024 * 1024.0);
-  Cache<float, 4> cache(stream, n_cols, cache_size);
-
-  ASSERT_EQ(cache.GetSize(), 8);
-
-  int n_cached;
-
-  cache.GetCacheIdxPartitioned(keys_dev.data(), 5, cache_idx_dev.data(), &n_cached, stream);
-  cache.AssignCacheIdx(keys_dev.data(), 5, cache_idx_dev.data(), stream);
-  cache.GetCacheIdxPartitioned(keys_dev.data(), 5, cache_idx_dev.data(), &n_cached, stream);
-
-  cache.StoreVecs(x_dev.data(), 10, n_cached, cache_idx_dev.data(), stream, keys_dev.data());
-  cache.GetCacheIdxPartitioned(keys_dev.data(), 5, cache_idx_dev.data(), &n_cached, stream);
-  cache.GetVecs(cache_idx_dev.data(), n_cached, tile_dev.data(), stream);
-
-  int cache_idx_host[10];
-  raft::update_host(cache_idx_host, cache_idx_dev.data(), n_cached, stream);
-  int keys_host[10];
-  raft::update_host(keys_host, keys_dev.data(), n_cached, stream);
-  raft::interruptible::synchronize(stream);
-  for (int i = 0; i < n_cached; i++) {
-    EXPECT_TRUE(devArrMatch(x_dev.data() + keys_host[i] * n_cols,
-                            tile_dev.data() + i * n_cols,
-                            n_cols,
-                            raft::Compare<int>()))
-      << "vector " << i;
-  }
-
-  for (int k = 0; k < 4; k++) {
-    cache.GetCacheIdxPartitioned(keys_dev.data(), 10, cache_idx_dev.data(), &n_cached, stream);
-    if (k == 0) {
-      EXPECT_EQ(n_cached, 5);
-    } else {
-      EXPECT_EQ(n_cached, 8);
-    }
-
-    cache.AssignCacheIdx(
-      keys_dev.data() + n_cached, 10 - n_cached, cache_idx_dev.data() + n_cached, stream);
-    cache.StoreVecs(x_dev.data(),
-                    10,
-                    10 - n_cached,
-                    cache_idx_dev.data() + n_cached,
-                    stream,
-                    keys_dev.data() + n_cached);
-
-    cache.GetVecs(cache_idx_dev.data(), 10, tile_dev.data(), stream);
-
-    raft::update_host(cache_idx_host, cache_idx_dev.data(), 10, stream);
-    raft::update_host(keys_host, keys_dev.data(), 10, stream);
-    raft::interruptible::synchronize(stream);
-    for (int i = 0; i < 10; i++) {
-      if (cache_idx_host[i] >= 0) {
-        EXPECT_TRUE(devArrMatch(x_dev.data() + keys_host[i] * n_cols,
-                                tile_dev.data() + i * n_cols,
-                                n_cols,
-                                raft::Compare<int>()))
-          << "vector " << i;
-      }
-    }
-  }
-}
-};  // end namespace Cache
-};  // end namespace MLCommon
diff --git a/cpp/test/prims/label.cu b/cpp/test/prims/label.cu
deleted file mode 100644
index fd1ef4fd5b..0000000000
--- a/cpp/test/prims/label.cu
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-
-#include <label/classlabels.cuh>
-
-#include "test_utils.h"
-#include <raft/core/cudart_utils.hpp>
-#include <raft/core/interruptible.hpp>
-#include <raft/cuda_utils.cuh>
-#include <rmm/device_uvector.hpp>
-
-#include <iostream>
-#include <vector>
-
-namespace MLCommon {
-namespace Label {
-
-class LabelTest : public ::testing::Test {
- protected:
-  void SetUp() override {}
-  void TearDown() override {}
-};
-
-typedef LabelTest MakeMonotonicTest;
-TEST_F(MakeMonotonicTest, Result)
-{
-  cudaStream_t stream = 0;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-  int m = 12;
-
-  rmm::device_uvector<float> data(m, stream);
-  rmm::device_uvector<float> actual(m, stream);
-  rmm::device_uvector<float> expected(m, stream);
-  RAFT_CUDA_TRY(cudaMemset(data.data(), 0, data.size() * sizeof(float)));
-  RAFT_CUDA_TRY(cudaMemset(actual.data(), 0, actual.size() * sizeof(float)));
-  RAFT_CUDA_TRY(cudaMemset(expected.data(), 0, expected.size() * sizeof(float)));
-
-  float* data_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 8.0, 7.0, 8.0, 8.0, 25.0, 80.0};
-
-  float* expected_h = new float[m]{1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 5.0, 6.0, 7.0};
-
-  raft::update_device(data.data(), data_h, m, stream);
-  raft::update_device(expected.data(), expected_h, m, stream);
-
-  make_monotonic(actual.data(), data.data(), m, stream);
-
-  raft::interruptible::synchronize(stream);
-
-  ASSERT_TRUE(devArrMatch(actual.data(), expected.data(), m, raft::Compare<bool>(), stream));
-
-  RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-
-  delete data_h;
-  delete expected_h;
-}
-
-TEST(LabelTest, ClassLabels)
-{
-  cudaStream_t stream = 0;
-  RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-  int n_rows = 6;
-  rmm::device_uvector<float> y_d(n_rows, stream);
-
-  float y_h[] = {2, -1, 1, 2, 1, 1};
-  raft::update_device(y_d.data(), y_h, n_rows, stream);
-
-  rmm::device_uvector<float> y_unique_d(n_rows, stream);
-  auto n_classes = getUniqueLabels(y_d.data(), n_rows, y_unique_d.data(), stream);
-  y_unique_d.resize(n_classes, stream);
-
-  ASSERT_EQ(n_classes, 3);
-
-  float y_unique_exp[] = {-1, 1, 2};
-  EXPECT_TRUE(
-    devArrMatchHost(y_unique_exp, y_unique_d.data(), n_classes, raft::Compare<float>(), stream));
-
-  rmm::device_uvector<float> y_relabeled_d(n_rows, stream);
-
-  getOvrLabels(y_d.data(), n_rows, y_unique_d.data(), n_classes, y_relabeled_d.data(), 2, stream);
-
-  float y_relabeled_exp[] = {1, -1, -1, 1, -1, -1};
-  EXPECT_TRUE(
-    devArrMatchHost(y_relabeled_exp, y_relabeled_d.data(), n_rows, raft::Compare<float>(), stream));
-
-  RAFT_CUDA_TRY(cudaStreamDestroy(stream));
-}
-};  // namespace Label
-};  // namespace MLCommon
diff --git a/cpp/test/prims/merge_labels.cu b/cpp/test/prims/merge_labels.cu
deleted file mode 100644
index 76e040d20d..0000000000
--- a/cpp/test/prims/merge_labels.cu
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-#include <label/merge_labels.cuh>
-
-#include "test_utils.h"
-#include <raft/core/cudart_utils.hpp>
-#include <raft/core/handle.hpp>
-#include <rmm/device_uvector.hpp>
-#include <thrust/device_ptr.h>
-
-#include <vector>
-
-namespace MLCommon {
-namespace Label {
-
-template <typename Index_>
-struct MergeLabelsInputs {
-  Index_ N;
-  std::vector<Index_> labels_a;
-  std::vector<Index_> labels_b;
-  std::vector<uint8_t> mask;  // to avoid std::vector<bool> optimization
-  std::vector<Index_> expected;
-};
-
-template <typename Index_>
-class MergeLabelsTest : public ::testing::TestWithParam<MergeLabelsInputs<Index_>> {
- protected:
-  MergeLabelsTest()
-    : params(::testing::TestWithParam<MergeLabelsInputs<Index_>>::GetParam()),
-      stream(handle.get_stream()),
-      labels_a(params.N, stream),
-      labels_b(params.N, stream),
-      expected(params.N, stream),
-      R(params.N, stream),
-      mask(params.N, stream),
-      m(1, stream)
-  {
-  }
-
-  void Run()
-  {
-    raft::update_device(labels_a.data(), params.labels_a.data(), params.N, stream);
-    raft::update_device(labels_b.data(), params.labels_b.data(), params.N, stream);
-    raft::update_device(expected.data(), params.expected.data(), params.N, stream);
-    raft::update_device(mask.data(), reinterpret_cast<bool*>(params.mask.data()), params.N, stream);
-
-    merge_labels(
-      labels_a.data(), labels_b.data(), mask.data(), R.data(), m.data(), params.N, stream);
-
-    cudaStreamSynchronize(stream);
-    ASSERT_TRUE(raft::devArrMatch<Index_>(
-      expected.data(), labels_a.data(), params.N, raft::Compare<Index_>()));
-  }
-
- protected:
-  MergeLabelsInputs<Index_> params;
-  raft::handle_t handle;
-  cudaStream_t stream = 0;
-  rmm::device_uvector<Index_> labels_a, labels_b, expected, R;
-  rmm::device_uvector<bool> mask, m;
-};
-
-using MergeLabelsTestI = MergeLabelsTest<int>;
-TEST_P(MergeLabelsTestI, Result) { Run(); }
-
-using MergeLabelsTestL = MergeLabelsTest<int64_t>;
-TEST_P(MergeLabelsTestL, Result) { Run(); }
-
-constexpr int MAX32     = std::numeric_limits<int>::max();
-constexpr int64_t MAX64 = std::numeric_limits<int64_t>::max();
-
-const std::vector<MergeLabelsInputs<int>> merge_inputs_32 = {
-  {4, {1, 1, 3, MAX32}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}},
-  {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-  {6, {1, 2, 1, 4, 5, MAX32}, {1, 2, MAX32, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}},
-  {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}},
-  {8,
-   {1, 1, 3, 3, MAX32, 1, 3, MAX32},
-   {1, 2, 3, 2, MAX32, 2, 2, 2},
-   {1, 1, 1, 1, 0, 1, 1, 0},
-   {1, 1, 1, 1, MAX32, 1, 1, 1}},
-  {8,
-   {1, 1, 1, 4, 4, 7, 7, 8},
-   {1, 2, 2, 2, 2, 7, 7, 7},
-   {1, 1, 1, 1, 0, 0, 1, 1},
-   {1, 1, 1, 1, 1, 7, 7, 7}},
-};
-
-const std::vector<MergeLabelsInputs<int64_t>> merge_inputs_64 = {
-  {4, {1, 1, 3, MAX64}, {1, 3, 3, 1}, {1, 0, 1, 0}, {1, 1, 3, 1}},
-  {5, {1, 2, 2, 2, 1}, {4, 2, 4, 4, 4}, {1, 1, 1, 1, 1}, {1, 1, 1, 1, 1}},
-  {6, {1, 2, 1, 4, 5, MAX64}, {1, 2, MAX64, 4, 5, 4}, {1, 1, 0, 1, 1, 0}, {1, 2, 1, 4, 5, 4}},
-  {6, {1, 2, 2, 2, 2, 6}, {1, 1, 1, 5, 5, 5}, {1, 1, 1, 1, 1, 1}, {1, 1, 1, 1, 1, 1}},
-  {8,
-   {1, 1, 3, 3, MAX64, 1, 3, MAX64},
-   {1, 2, 3, 2, MAX64, 2, 2, 2},
-   {1, 1, 1, 1, 0, 1, 1, 0},
-   {1, 1, 1, 1, MAX64, 1, 1, 1}},
-  {8,
-   {1, 1, 1, 4, 4, 7, 7, 8},
-   {1, 2, 2, 2, 2, 7, 7, 7},
-   {1, 1, 1, 1, 0, 0, 1, 1},
-   {1, 1, 1, 1, 1, 7, 7, 7}},
-};
-
-INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestI, ::testing::ValuesIn(merge_inputs_32));
-INSTANTIATE_TEST_CASE_P(MergeLabelsTests, MergeLabelsTestL, ::testing::ValuesIn(merge_inputs_64));
-
-}  // namespace Label
-}  // namespace MLCommon
diff --git a/cpp/test/sg/svc_test.cu b/cpp/test/sg/svc_test.cu
index bbe7c55b3e..a43be586d2 100644
--- a/cpp/test/sg/svc_test.cu
+++ b/cpp/test/sg/svc_test.cu
@@ -53,7 +53,6 @@
 namespace ML {
 namespace SVM {
 using namespace raft::distance::kernels;
-using namespace Matrix;
 
 // Initialize device vector C_vec with scalar C
 template <typename math_t>
@@ -699,7 +698,7 @@ class SmoSolverTest : public ::testing::Test {
  protected:
   void SetUp() override
   {
-    LinAlg::range(sample_weights_dev.data(), 1, n_rows + 1, stream);
+    raft::linalg::range(sample_weights_dev.data(), 1, n_rows + 1, stream);
 
     raft::update_device(x_dev.data(), x_host, n_rows * n_cols, stream);
     raft::update_device(ws_idx_dev.data(), ws_idx_host, n_ws, stream);
diff --git a/cpp/test/sg/tsne_test.cu b/cpp/test/sg/tsne_test.cu
index 4a32922af6..3cdf8c08ae 100644
--- a/cpp/test/sg/tsne_test.cu
+++ b/cpp/test/sg/tsne_test.cu
@@ -26,12 +26,13 @@
 #include <datasets/digits.h>
 #include <gtest/gtest.h>
 #include <iostream>
-<<<<<<< HEAD
-#include <raft/cudart_utils.h>
+#include <raft/util/cudart_utils.h>
+
+#if defined RAFT_DISTANCE_COMPILED
 #include <raft/spatial/knn/specializations.hpp>
-=======
-#include <raft/core/cudart_utils.hpp>
->>>>>>> branch-22.10
+#endif
+
+#include <raft/util/cudart_utils.hpp>
 #include <stdio.h>
 #include <stdlib.h>
 #include <thrust/reduce.h>
diff --git a/cpp/test/sg/umap_parametrizable_test.cu b/cpp/test/sg/umap_parametrizable_test.cu
index 56cb3e1e8d..3cb34c6080 100644
--- a/cpp/test/sg/umap_parametrizable_test.cu
+++ b/cpp/test/sg/umap_parametrizable_test.cu
@@ -24,23 +24,21 @@
 #include <cuml/metrics/metrics.hpp>
 #include <cuml/neighbors/knn.hpp>
 #include <datasets/digits.h>
-<<<<<<< HEAD
-#include <raft/cudart_utils.h>
+
+#if defined RAFT_NN_COMPILED
 #include <raft/spatial/knn/specializations.hpp>
-=======
-#include <raft/core/cudart_utils.hpp>
->>>>>>> branch-22.10
+#endif
+
 #include <test_utils.h>
 
 #include <datasets/digits.h>
 #include <raft/linalg/reduce_rows_by_key.cuh>
-#include <selection/knn.cuh>
+#include <raft/spatial/knn/knn.cuh>
 
-#include <raft/core/cudart_utils.hpp>
 #include <raft/core/handle.hpp>
-#include <raft/cuda_utils.cuh>
 #include <raft/distance/distance.cuh>
-#include <selection/knn.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
 #include <umap/runner.cuh>
 
 #include <gtest/gtest.h>

From a7ee4a80bc7db4062e9698efa97d17367ffa2149 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Sun, 16 Oct 2022 14:20:13 -0400
Subject: [PATCH 34/38] Final cleanup

---
 cpp/src/dbscan/runner.cuh        |   4 -
 cpp/src/solver/lars_impl.cuh     |   8 +-
 cpp/src/svm/svr_impl.cuh         |   1 -
 cpp/test/CMakeLists.txt          |   1 -
 cpp/test/prims/gather.cu         | 155 -------------------------------
 cpp/test/prims/knn_classify.cu   |   4 +-
 cpp/test/prims/knn_regression.cu |   6 +-
 cpp/test/sg/tsne_test.cu         |   1 -
 8 files changed, 9 insertions(+), 171 deletions(-)
 delete mode 100644 cpp/test/prims/gather.cu

diff --git a/cpp/src/dbscan/runner.cuh b/cpp/src/dbscan/runner.cuh
index d4c6100361..284009650d 100644
--- a/cpp/src/dbscan/runner.cuh
+++ b/cpp/src/dbscan/runner.cuh
@@ -32,8 +32,6 @@
 
 #include <raft/core/nvtx.hpp>
 
-#include <label/classlabels.cuh>
-
 #include <thrust/copy.h>
 #include <thrust/device_ptr.h>
 #include <thrust/fill.h>
@@ -44,8 +42,6 @@
 namespace ML {
 namespace Dbscan {
 
-using namespace MLCommon;
-
 static const int TPB = 256;
 
 /**
diff --git a/cpp/src/solver/lars_impl.cuh b/cpp/src/solver/lars_impl.cuh
index 42767aa6c7..3bf30c9785 100644
--- a/cpp/src/solver/lars_impl.cuh
+++ b/cpp/src/solver/lars_impl.cuh
@@ -21,13 +21,13 @@
 #include <numeric>
 #include <vector>
 
-#include <cache/cache_util.cuh>
 #include <cub/cub.cuh>
 #include <cuml/common/logger.hpp>
-#include <raft/core/cudart_utils.hpp>
-#include <raft/cuda_utils.cuh>
 #include <raft/linalg/add.cuh>
 #include <raft/linalg/cholesky_r1_update.cuh>
+#include <raft/util/cache_util.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
 // #TODO: Replace with public header when ready
 #include <raft/linalg/detail/cublas_wrappers.hpp>
 #include <raft/linalg/gemv.cuh>
@@ -1117,7 +1117,7 @@ void larsPredict(const raft::handle_t& handle,
     // We collect active columns of X to contiguous space
     X_active_cols.resize(n_active * ld_X, stream);
     const int TPB = 64;
-    MLCommon::Cache::get_vecs<<<raft::ceildiv(n_active * ld_X, TPB), TPB, 0, stream>>>(
+    raft::cache::get_vecs<<<raft::ceildiv(n_active * ld_X, TPB), TPB, 0, stream>>>(
       X, ld_X, active_idx, n_active, X_active_cols.data());
     RAFT_CUDA_TRY(cudaGetLastError());
     X = X_active_cols.data();
diff --git a/cpp/src/svm/svr_impl.cuh b/cpp/src/svm/svr_impl.cuh
index 735e5a7e4f..9b48e7f989 100644
--- a/cpp/src/svm/svr_impl.cuh
+++ b/cpp/src/svm/svr_impl.cuh
@@ -28,7 +28,6 @@
 #include <cublas_v2.h>
 #include <cuml/svm/svm_model.h>
 #include <cuml/svm/svm_parameter.h>
-#include <label/classlabels.cuh>
 #include <raft/distance/kernels.cuh>
 #include <raft/linalg/unary_op.cuh>
 #include <raft/matrix/matrix.cuh>
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 79f495ffb0..455df0eabd 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -237,7 +237,6 @@ if(BUILD_PRIMS_TESTS)
   ConfigureTest(PREFIX PRIMS NAME ELTWISE2D_TEST PATH prims/eltwise2d.cu)
   ConfigureTest(PREFIX PRIMS NAME FAST_INT_DIV_TEST PATH prims/fast_int_div.cu)
   ConfigureTest(PREFIX PRIMS NAME FILLNA_TEST PATH prims/fillna.cu)
-  ConfigureTest(PREFIX PRIMS NAME GATHER_TEST PATH prims/gather.cu)
   ConfigureTest(PREFIX PRIMS NAME GRID_SYNC_TEST PATH prims/grid_sync.cu)
   ConfigureTest(PREFIX PRIMS NAME HINGE_TEST PATH prims/hinge.cu)
   ConfigureTest(PREFIX PRIMS NAME JONES_TRANSFORM_TEST PATH prims/jones_transform.cu)
diff --git a/cpp/test/prims/gather.cu b/cpp/test/prims/gather.cu
deleted file mode 100644
index 6c0e223b9c..0000000000
--- a/cpp/test/prims/gather.cu
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "test_utils.h"
-#include <gtest/gtest.h>
-#include <matrix/gather.cuh>
-#include <raft/core/cudart_utils.hpp>
-#include <raft/core/interruptible.hpp>
-#include <raft/cuda_utils.cuh>
-#include <raft/random/rng.cuh>
-#include <rmm/device_uvector.hpp>
-
-namespace MLCommon {
-namespace Matrix {
-
-template <typename MatrixIteratorT, typename MapIteratorT>
-void naiveGatherImpl(
-  MatrixIteratorT in, int D, int N, MapIteratorT map, int map_length, MatrixIteratorT out)
-{
-  for (int outRow = 0; outRow < map_length; ++outRow) {
-    typename std::iterator_traits<MapIteratorT>::value_type map_val = map[outRow];
-    int inRowStart                                                  = map_val * D;
-    int outRowStart                                                 = outRow * D;
-    for (int i = 0; i < D; ++i) {
-      out[outRowStart + i] = in[inRowStart + i];
-    }
-  }
-}
-
-template <typename MatrixIteratorT, typename MapIteratorT>
-void naiveGather(
-  MatrixIteratorT in, int D, int N, MapIteratorT map, int map_length, MatrixIteratorT out)
-{
-  naiveGatherImpl(in, D, N, map, map_length, out);
-}
-
-template <typename MatrixIteratorT, typename MapIteratorT>
-void gatherLaunch(MatrixIteratorT in,
-                  int D,
-                  int N,
-                  MapIteratorT map,
-                  int map_length,
-                  MatrixIteratorT out,
-                  cudaStream_t stream)
-{
-  typedef typename std::iterator_traits<MapIteratorT>::value_type MapValueT;
-  Matrix::gather(in, D, N, map, map_length, out, stream);
-}
-
-struct GatherInputs {
-  uint32_t nrows;
-  uint32_t ncols;
-  uint32_t map_length;
-  unsigned long long int seed;
-};
-
-template <typename MatrixT, typename MapT>
-class GatherTest : public ::testing::TestWithParam<GatherInputs> {
- protected:
-  GatherTest() : d_in(0, stream), d_out_exp(0, stream), d_out_act(0, stream), d_map(0, stream) {}
-
-  void SetUp() override
-  {
-    params = ::testing::TestWithParam<GatherInputs>::GetParam();
-    raft::random::Rng r(params.seed);
-    raft::random::Rng r_int(params.seed);
-    RAFT_CUDA_TRY(cudaStreamCreate(&stream));
-
-    uint32_t nrows      = params.nrows;
-    uint32_t ncols      = params.ncols;
-    uint32_t map_length = params.map_length;
-    uint32_t len        = nrows * ncols;
-
-    // input matrix setup
-    d_in.resize(nrows * ncols, stream);
-    h_in.resize(nrows * ncols);
-    r.uniform(d_in.data(), len, MatrixT(-1.0), MatrixT(1.0), stream);
-    raft::update_host(h_in.data(), d_in.data(), len, stream);
-
-    // map setup
-    d_map.resize(map_length, stream);
-    h_map.resize(map_length);
-    r_int.uniformInt(d_map.data(), map_length, (MapT)0, nrows, stream);
-    raft::update_host(h_map.data(), d_map.data(), map_length, stream);
-
-    // expected and actual output matrix setup
-    h_out.resize(map_length * ncols);
-    d_out_exp.resize(map_length * ncols, stream);
-    d_out_act.resize(map_length * ncols, stream);
-
-    // launch gather on the host and copy the results to device
-    naiveGather(h_in.data(), ncols, nrows, h_map.data(), map_length, h_out.data());
-    raft::update_device(d_out_exp.data(), h_out.data(), map_length * ncols, stream);
-
-    // launch device version of the kernel
-    gatherLaunch(d_in.data(), ncols, nrows, d_map.data(), map_length, d_out_act.data(), stream);
-
-    raft::interruptible::synchronize(stream);
-  }
-  void TearDown() override { RAFT_CUDA_TRY(cudaStreamDestroy(stream)); }
-
- protected:
-  cudaStream_t stream = 0;
-  GatherInputs params;
-  std::vector<MatrixT> h_in, h_out;
-  std::vector<MapT> h_map;
-  rmm::device_uvector<MatrixT> d_in, d_out_exp, d_out_act;
-  rmm::device_uvector<MapT> d_map;
-};
-
-const std::vector<GatherInputs> inputs = {{1024, 32, 128, 1234ULL},
-                                          {1024, 32, 256, 1234ULL},
-                                          {1024, 32, 512, 1234ULL},
-                                          {1024, 32, 1024, 1234ULL},
-                                          {1024, 64, 128, 1234ULL},
-                                          {1024, 64, 256, 1234ULL},
-                                          {1024, 64, 512, 1234ULL},
-                                          {1024, 64, 1024, 1234ULL},
-                                          {1024, 128, 128, 1234ULL},
-                                          {1024, 128, 256, 1234ULL},
-                                          {1024, 128, 512, 1234ULL},
-                                          {1024, 128, 1024, 1234ULL}};
-
-typedef GatherTest<float, uint32_t> GatherTestF;
-TEST_P(GatherTestF, Result)
-{
-  ASSERT_TRUE(devArrMatch(
-    d_out_exp.data(), d_out_act.data(), params.map_length * params.ncols, raft::Compare<float>()));
-}
-
-typedef GatherTest<double, uint32_t> GatherTestD;
-TEST_P(GatherTestD, Result)
-{
-  ASSERT_TRUE(devArrMatch(
-    d_out_exp.data(), d_out_act.data(), params.map_length * params.ncols, raft::Compare<double>()));
-}
-
-INSTANTIATE_TEST_CASE_P(GatherTests, GatherTestF, ::testing::ValuesIn(inputs));
-INSTANTIATE_TEST_CASE_P(GatherTests, GatherTestD, ::testing::ValuesIn(inputs));
-
-}  // end namespace Matrix
-}  // end namespace MLCommon
diff --git a/cpp/test/prims/knn_classify.cu b/cpp/test/prims/knn_classify.cu
index 26c879e974..21b250f207 100644
--- a/cpp/test/prims/knn_classify.cu
+++ b/cpp/test/prims/knn_classify.cu
@@ -17,11 +17,11 @@
 #include "test_utils.h"
 #include <gtest/gtest.h>
 #include <iostream>
-#include <raft/core/cudart_utils.hpp>
-#include <raft/cuda_utils.cuh>
 #include <raft/label/classlabels.cuh>
 #include <raft/random/make_blobs.cuh>
 #include <raft/spatial/knn/knn.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
 #include <rmm/device_uvector.hpp>
 #include <selection/knn.cuh>
 #include <vector>
diff --git a/cpp/test/prims/knn_regression.cu b/cpp/test/prims/knn_regression.cu
index 5a57006267..e276e31839 100644
--- a/cpp/test/prims/knn_regression.cu
+++ b/cpp/test/prims/knn_regression.cu
@@ -18,13 +18,13 @@
 
 #include <gtest/gtest.h>
 
-#include <label/classlabels.cuh>
+#include <raft/label/classlabels.cuh>
 
-#include <raft/core/cudart_utils.hpp>
-#include <raft/cuda_utils.cuh>
 #include <raft/linalg/reduce.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/spatial/knn/knn.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/cudart_utils.hpp>
 
 #include <rmm/device_uvector.hpp>
 
diff --git a/cpp/test/sg/tsne_test.cu b/cpp/test/sg/tsne_test.cu
index 3cdf8c08ae..e4503b2934 100644
--- a/cpp/test/sg/tsne_test.cu
+++ b/cpp/test/sg/tsne_test.cu
@@ -26,7 +26,6 @@
 #include <datasets/digits.h>
 #include <gtest/gtest.h>
 #include <iostream>
-#include <raft/util/cudart_utils.h>
 
 #if defined RAFT_DISTANCE_COMPILED
 #include <raft/spatial/knn/specializations.hpp>

From 9c0d308867252f9b187d4fe384fbd0f603c941b9 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Mon, 17 Oct 2022 21:30:38 -0400
Subject: [PATCH 35/38] Reverting get_raft.cmake

---
 cpp/cmake/thirdparty/get_raft.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index 288463f6ec..59be6b9002 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -81,8 +81,8 @@ endfunction()
 # To use a different RAFT locally, set the CMake variable
 # CPM_raft_SOURCE=/path/to/local/raft
 find_and_configure_raft(VERSION          ${CUML_MIN_VERSION_raft}
-                        FORK             cjnolet
-                        PINNED_TAG       imp-2212-move_grammatrix #branch-${CUML_BRANCH_VERSION_raft}
+                        FORK             rapidsai
+                        PINNED_TAG       branch-${CUML_BRANCH_VERSION_raft}
 
                         # When PINNED_TAG above doesn't match cuml,
                         # force local raft clone in build directory

From edee19cd20291bd721c99d5118a10f9e4c42a652 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 18 Oct 2022 08:22:35 -0400
Subject: [PATCH 36/38] Fixing header definition and namespace for kernel gram
 API

---
 python/cuml/svm/svm_base.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuml/svm/svm_base.pyx b/python/cuml/svm/svm_base.pyx
index 601c629bca..8470306c70 100644
--- a/python/cuml/svm/svm_base.pyx
+++ b/python/cuml/svm/svm_base.pyx
@@ -37,7 +37,7 @@ from cuml.common.mixins import FMajorInputTagMixin
 from libcpp cimport bool
 
 
-cdef extern from "cuml/matrix/kernelparams.h" namespace "MLCommon::Matrix":
+cdef extern from "raft/distance/distance_types.hpp" namespace "raft::distance::kernels":
     enum KernelType:
         LINEAR,
         POLYNOMIAL,

From 2a26900d9681946ae52007ca2f9c279f0725ab12 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Tue, 18 Oct 2022 09:17:26 -0400
Subject: [PATCH 37/38] Fixing style

---
 python/cuml/svm/svc.pyx      | 3 ++-
 python/cuml/svm/svm_base.pyx | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/cuml/svm/svc.pyx b/python/cuml/svm/svc.pyx
index 40e0cc9b75..39d40bc55c 100644
--- a/python/cuml/svm/svc.pyx
+++ b/python/cuml/svm/svc.pyx
@@ -48,7 +48,8 @@ if has_sklearn():
     from cuml.multiclass import MulticlassClassifier
     from sklearn.calibration import CalibratedClassifierCV
 
-cdef extern from "cuml/matrix/kernelparams.h" namespace "MLCommon::Matrix":
+cdef extern from "raft/distance/distance_types.hpp" \
+        namespace "raft::distance::kernels":
     enum KernelType:
         LINEAR,
         POLYNOMIAL,
diff --git a/python/cuml/svm/svm_base.pyx b/python/cuml/svm/svm_base.pyx
index 8470306c70..f12fe0a1a9 100644
--- a/python/cuml/svm/svm_base.pyx
+++ b/python/cuml/svm/svm_base.pyx
@@ -37,7 +37,8 @@ from cuml.common.mixins import FMajorInputTagMixin
 from libcpp cimport bool
 
 
-cdef extern from "raft/distance/distance_types.hpp" namespace "raft::distance::kernels":
+cdef extern from "raft/distance/distance_types.hpp" \
+        namespace "raft::distance::kernels":
     enum KernelType:
         LINEAR,
         POLYNOMIAL,

From 977690f982c4b252e0d93945a29d05622e361bc2 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 19 Oct 2022 13:09:50 -0400
Subject: [PATCH 38/38] Fixing small error in v_measure

---
 cpp/src/metrics/v_measure.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/metrics/v_measure.cu b/cpp/src/metrics/v_measure.cu
index 75de521971..9dd97bc6f7 100644
--- a/cpp/src/metrics/v_measure.cu
+++ b/cpp/src/metrics/v_measure.cu
@@ -31,7 +31,7 @@ double v_measure(const raft::handle_t& handle,
                  double beta)
 {
   return raft::stats::v_measure(
-    y, y_hat, n, lower_class_range, upper_class_range, handle.get_stream());
+    y, y_hat, n, lower_class_range, upper_class_range, handle.get_stream(), beta);
 }
 }  // namespace Metrics
 }  // namespace ML