From d61595091fe720bd5628e8a0d85fed5512ee2efb Mon Sep 17 00:00:00 2001 From: "Corey J. Nolet" Date: Wed, 16 Feb 2022 13:03:57 -0500 Subject: [PATCH 01/38] Using metrics from raft --- cpp/src/metrics/accuracy_score.cu | 4 +- cpp/src/metrics/adjusted_rand_index.cu | 6 +- cpp/src/metrics/completeness_score.cu | 4 +- cpp/src/metrics/entropy.cu | 5 +- cpp/src/metrics/homogeneity_score.cu | 4 +- cpp/src/metrics/kl_divergence.cu | 6 +- cpp/src/metrics/mutual_info_score.cu | 4 +- cpp/src/metrics/r2_score.cu | 6 +- cpp/src/metrics/rand_index.cu | 4 +- cpp/src/metrics/silhouette_score.cu | 9 +- cpp/src/metrics/trustworthiness.cu | 4 +- cpp/src/metrics/v_measure.cu | 4 +- cpp/src_prims/metrics/adjusted_rand_index.cuh | 195 ------- .../metrics/batched/information_criterion.cuh | 85 ---- .../metrics/batched/silhouette_score.cuh | 278 ---------- cpp/src_prims/metrics/completeness_score.cuh | 69 --- cpp/src_prims/metrics/contingencyMatrix.cuh | 312 ------------ cpp/src_prims/metrics/dispersion.cuh | 136 ----- cpp/src_prims/metrics/entropy.cuh | 152 ------ cpp/src_prims/metrics/homogeneity_score.cuh | 70 --- cpp/src_prims/metrics/kl_divergence.cuh | 83 --- cpp/src_prims/metrics/mutual_info_score.cuh | 176 ------- cpp/src_prims/metrics/rand_index.cuh | 164 ------ cpp/src_prims/metrics/scores.cuh | 215 -------- cpp/src_prims/metrics/silhouette_score.cuh | 331 ------------ .../metrics/trustworthiness_score.cuh | 218 -------- cpp/src_prims/metrics/v_measure.cuh | 63 --- cpp/src_prims/selection/columnWiseSort.cuh | 346 ------------- cpp/src_prims/selection/haversine_knn.cuh | 136 ----- cpp/src_prims/selection/knn.cuh | 348 ------------- cpp/src_prims/selection/processing.cuh | 226 -------- cpp/test/CMakeLists.txt | 31 +- cpp/test/prims/adjusted_rand_index.cu | 201 -------- cpp/test/prims/completeness_score.cu | 136 ----- cpp/test/prims/contingencyMatrix.cu | 170 ------- cpp/test/prims/dispersion.cu | 125 ----- cpp/test/prims/entropy.cu | 118 ----- cpp/test/prims/homogeneity_score.cu | 134 ----- cpp/test/prims/kl_divergence.cu | 105 ---- cpp/test/prims/mutual_info_score.cu | 163 ------ cpp/test/prims/rand_index.cu | 129 ----- cpp/test/prims/score.cu | 481 ------------------ cpp/test/prims/silhouette_score.cu | 230 --------- cpp/test/prims/trustworthiness.cu | 335 ------------ cpp/test/prims/v_measure.cu | 139 ----- 45 files changed, 38 insertions(+), 6122 deletions(-) delete mode 100644 cpp/src_prims/metrics/adjusted_rand_index.cuh delete mode 100644 cpp/src_prims/metrics/batched/information_criterion.cuh delete mode 100644 cpp/src_prims/metrics/batched/silhouette_score.cuh delete mode 100644 cpp/src_prims/metrics/completeness_score.cuh delete mode 100644 cpp/src_prims/metrics/contingencyMatrix.cuh delete mode 100644 cpp/src_prims/metrics/dispersion.cuh delete mode 100644 cpp/src_prims/metrics/entropy.cuh delete mode 100644 cpp/src_prims/metrics/homogeneity_score.cuh delete mode 100644 cpp/src_prims/metrics/kl_divergence.cuh delete mode 100644 cpp/src_prims/metrics/mutual_info_score.cuh delete mode 100644 cpp/src_prims/metrics/rand_index.cuh delete mode 100644 cpp/src_prims/metrics/scores.cuh delete mode 100644 cpp/src_prims/metrics/silhouette_score.cuh delete mode 100644 cpp/src_prims/metrics/trustworthiness_score.cuh delete mode 100644 cpp/src_prims/metrics/v_measure.cuh delete mode 100644 cpp/src_prims/selection/columnWiseSort.cuh delete mode 100644 cpp/src_prims/selection/haversine_knn.cuh delete mode 100644 cpp/src_prims/selection/knn.cuh delete mode 100644 cpp/src_prims/selection/processing.cuh delete mode 100644 cpp/test/prims/adjusted_rand_index.cu delete mode 100644 cpp/test/prims/completeness_score.cu delete mode 100644 cpp/test/prims/contingencyMatrix.cu delete mode 100644 cpp/test/prims/dispersion.cu delete mode 100644 cpp/test/prims/entropy.cu delete mode 100644 cpp/test/prims/homogeneity_score.cu delete mode 100644 cpp/test/prims/kl_divergence.cu delete mode 100644 cpp/test/prims/mutual_info_score.cu delete mode 100644 cpp/test/prims/rand_index.cu delete mode 100644 cpp/test/prims/score.cu delete mode 100644 cpp/test/prims/silhouette_score.cu delete mode 100644 cpp/test/prims/trustworthiness.cu delete mode 100644 cpp/test/prims/v_measure.cu diff --git a/cpp/src/metrics/accuracy_score.cu b/cpp/src/metrics/accuracy_score.cu index 821cdd79e9..048d4f9047 100644 --- a/cpp/src/metrics/accuracy_score.cu +++ b/cpp/src/metrics/accuracy_score.cu @@ -16,7 +16,7 @@ */ #include -#include +#include namespace ML { @@ -27,7 +27,7 @@ float accuracy_score_py(const raft::handle_t& handle, const int* ref_predictions, int n) { - return MLCommon::Score::accuracy_score(predictions, ref_predictions, n, handle.get_stream()); + return raft::stats::accuracy_score(predictions, ref_predictions, n, handle.get_stream()); } } // namespace Metrics } // namespace ML diff --git a/cpp/src/metrics/adjusted_rand_index.cu b/cpp/src/metrics/adjusted_rand_index.cu index a3a55b3a0a..f2969663d8 100644 --- a/cpp/src/metrics/adjusted_rand_index.cu +++ b/cpp/src/metrics/adjusted_rand_index.cu @@ -16,7 +16,7 @@ */ #include -#include +#include namespace ML { @@ -26,7 +26,7 @@ double adjusted_rand_index(const raft::handle_t& handle, const int64_t* y_hat, const int64_t n) { - return MLCommon::Metrics::compute_adjusted_rand_index( + return raft::stats::adjusted_rand_index( y, y_hat, n, handle.get_stream()); } @@ -35,7 +35,7 @@ double adjusted_rand_index(const raft::handle_t& handle, const int* y_hat, const int n) { - return MLCommon::Metrics::compute_adjusted_rand_index( + return raft::stats::adjusted_rand_index( y, y_hat, n, handle.get_stream()); } } // namespace Metrics diff --git a/cpp/src/metrics/completeness_score.cu b/cpp/src/metrics/completeness_score.cu index b7b95a05e7..786be71387 100644 --- a/cpp/src/metrics/completeness_score.cu +++ b/cpp/src/metrics/completeness_score.cu @@ -16,7 +16,7 @@ */ #include -#include +#include namespace ML { @@ -29,7 +29,7 @@ double completeness_score(const raft::handle_t& handle, const int lower_class_range, const int upper_class_range) { - return MLCommon::Metrics::homogeneity_score( + return raft::stats::homogeneity_score( y_hat, y, n, lower_class_range, upper_class_range, handle.get_stream()); } diff --git a/cpp/src/metrics/entropy.cu b/cpp/src/metrics/entropy.cu index 1935c427aa..a82ff1df9d 100644 --- a/cpp/src/metrics/entropy.cu +++ b/cpp/src/metrics/entropy.cu @@ -16,7 +16,7 @@ */ #include -#include +#include namespace ML { @@ -27,8 +27,7 @@ double entropy(const raft::handle_t& handle, const int lower_class_range, const int upper_class_range) { - return MLCommon::Metrics::entropy( - y, n, lower_class_range, upper_class_range, handle.get_stream()); + return raft::stats::entropy(y, n, lower_class_range, upper_class_range, handle.get_stream()); } } // namespace Metrics } // namespace ML diff --git a/cpp/src/metrics/homogeneity_score.cu b/cpp/src/metrics/homogeneity_score.cu index 3f2b231bf2..f902834821 100644 --- a/cpp/src/metrics/homogeneity_score.cu +++ b/cpp/src/metrics/homogeneity_score.cu @@ -16,7 +16,7 @@ */ #include -#include +#include namespace ML { @@ -29,7 +29,7 @@ double homogeneity_score(const raft::handle_t& handle, const int lower_class_range, const int upper_class_range) { - return MLCommon::Metrics::homogeneity_score( + return raft::stats::homogeneity_score( y, y_hat, n, lower_class_range, upper_class_range, handle.get_stream()); } } // namespace Metrics diff --git a/cpp/src/metrics/kl_divergence.cu b/cpp/src/metrics/kl_divergence.cu index f4c9ad6047..7e80f01c6a 100644 --- a/cpp/src/metrics/kl_divergence.cu +++ b/cpp/src/metrics/kl_divergence.cu @@ -16,7 +16,7 @@ */ #include -#include +#include namespace ML { @@ -24,12 +24,12 @@ namespace Metrics { double kl_divergence(const raft::handle_t& handle, const double* y, const double* y_hat, int n) { - return MLCommon::Metrics::kl_divergence(y, y_hat, n, handle.get_stream()); + return raft::stats::kl_divergence(y, y_hat, n, handle.get_stream()); } float kl_divergence(const raft::handle_t& handle, const float* y, const float* y_hat, int n) { - return MLCommon::Metrics::kl_divergence(y, y_hat, n, handle.get_stream()); + return raft::stats::kl_divergence(y, y_hat, n, handle.get_stream()); } } // namespace Metrics } // namespace ML diff --git a/cpp/src/metrics/mutual_info_score.cu b/cpp/src/metrics/mutual_info_score.cu index 1c2cf4c2a3..3b98654907 100644 --- a/cpp/src/metrics/mutual_info_score.cu +++ b/cpp/src/metrics/mutual_info_score.cu @@ -18,7 +18,7 @@ #include #include -#include +#include namespace ML { @@ -31,7 +31,7 @@ double mutual_info_score(const raft::handle_t& handle, const int lower_class_range, const int upper_class_range) { - return MLCommon::Metrics::mutual_info_score( + return raft::stats::mutual_info_score( y, y_hat, n, lower_class_range, upper_class_range, handle.get_stream()); } diff --git a/cpp/src/metrics/r2_score.cu b/cpp/src/metrics/r2_score.cu index 402f8e8606..ce3f99fb02 100644 --- a/cpp/src/metrics/r2_score.cu +++ b/cpp/src/metrics/r2_score.cu @@ -15,7 +15,7 @@ */ #include -#include +#include namespace ML { @@ -23,12 +23,12 @@ namespace Metrics { float r2_score_py(const raft::handle_t& handle, float* y, float* y_hat, int n) { - return MLCommon::Score::r2_score(y, y_hat, n, handle.get_stream()); + return raft::stats::r2_score(y, y_hat, n, handle.get_stream()); } double r2_score_py(const raft::handle_t& handle, double* y, double* y_hat, int n) { - return MLCommon::Score::r2_score(y, y_hat, n, handle.get_stream()); + return raft::stats::r2_score(y, y_hat, n, handle.get_stream()); } } // namespace Metrics diff --git a/cpp/src/metrics/rand_index.cu b/cpp/src/metrics/rand_index.cu index 021b0e1b28..8cc4af3ff8 100644 --- a/cpp/src/metrics/rand_index.cu +++ b/cpp/src/metrics/rand_index.cu @@ -18,7 +18,7 @@ #include #include -#include +#include namespace ML { @@ -26,7 +26,7 @@ namespace Metrics { double rand_index(const raft::handle_t& handle, const double* y, const double* y_hat, int n) { - return MLCommon::Metrics::compute_rand_index(y, y_hat, (uint64_t)n, handle.get_stream()); + return raft::stats::rand_index(y, y_hat, (uint64_t)n, handle.get_stream()); } } // namespace Metrics } // namespace ML diff --git a/cpp/src/metrics/silhouette_score.cu b/cpp/src/metrics/silhouette_score.cu index c80fe099f1..d9c4812274 100644 --- a/cpp/src/metrics/silhouette_score.cu +++ b/cpp/src/metrics/silhouette_score.cu @@ -16,9 +16,8 @@ */ #include -#include -#include #include +#include namespace ML { @@ -32,7 +31,7 @@ double silhouette_score(const raft::handle_t& handle, double* silScores, raft::distance::DistanceType metric) { - return MLCommon::Metrics::silhouette_score( + return raft::stats::silhouette_score( handle, y, nRows, nCols, labels, nLabels, silScores, handle.get_stream(), metric); } @@ -48,7 +47,7 @@ float silhouette_score(const raft::handle_t& handle, int chunk, raft::distance::DistanceType metric) { - return MLCommon::Metrics::Batched::silhouette_score( + return raft::stats::Batched::silhouette_score_batched( handle, X, n_rows, n_cols, y, n_labels, scores, chunk, metric); } @@ -62,7 +61,7 @@ double silhouette_score(const raft::handle_t& handle, int chunk, raft::distance::DistanceType metric) { - return MLCommon::Metrics::Batched::silhouette_score( + return raft::stats::silhouette_score_batched( handle, X, n_rows, n_cols, y, n_labels, scores, chunk, metric); } diff --git a/cpp/src/metrics/trustworthiness.cu b/cpp/src/metrics/trustworthiness.cu index e97776df47..c3c4a644df 100644 --- a/cpp/src/metrics/trustworthiness.cu +++ b/cpp/src/metrics/trustworthiness.cu @@ -14,7 +14,7 @@ * limitations under the License. */ -#include +#include #include @@ -50,7 +50,7 @@ double trustworthiness_score(const raft::handle_t& h, int n_neighbors, int batchSize) { - return MLCommon::Score::trustworthiness_score( + return raft::stats::trustworthiness_score( h, X, X_embedded, n, m, d, n_neighbors, batchSize); } diff --git a/cpp/src/metrics/v_measure.cu b/cpp/src/metrics/v_measure.cu index f71091543a..a979e988fc 100644 --- a/cpp/src/metrics/v_measure.cu +++ b/cpp/src/metrics/v_measure.cu @@ -16,7 +16,7 @@ */ #include -#include +#include namespace ML { @@ -29,7 +29,7 @@ double v_measure(const raft::handle_t& handle, const int lower_class_range, const int upper_class_range) { - return MLCommon::Metrics::v_measure( + return raft::stats::v_measure( y, y_hat, n, lower_class_range, upper_class_range, handle.get_stream()); } } // namespace Metrics diff --git a/cpp/src_prims/metrics/adjusted_rand_index.cuh b/cpp/src_prims/metrics/adjusted_rand_index.cuh deleted file mode 100644 index 3f0780e656..0000000000 --- a/cpp/src_prims/metrics/adjusted_rand_index.cuh +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file adjusted_rand_index.cuh - * @brief The adjusted Rand index is the corrected-for-chance version of the Rand index. - * Such a correction for chance establishes a baseline by using the expected similarity - * of all pair-wise comparisons between clusterings specified by a random model. - */ - -#pragma once - -#include "contingencyMatrix.cuh" -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace MLCommon { -namespace Metrics { - -/** - * @brief Lambda to calculate the number of unordered pairs in a given input - * - * @tparam Type: Data type of the input - * @param in: the input to the functional mapping - * @param i: the indexing(not used in this case) - */ -template -struct nCTwo { - HDI Type operator()(Type in, int i = 0) - { - return in % 2 ? ((in - 1) >> 1) * in : (in >> 1) * (in - 1); - } -}; - -template -struct Binner { - Binner(DataT minL) : minLabel(minL) {} - - DI int operator()(DataT val, IdxT row, IdxT col) { return int(val - minLabel); } - - private: - DataT minLabel; -}; // struct Binner - -/** - * @brief Function to count the number of unique elements in the input array - * - * @tparam T data-type for input arrays - * - * @param[in] arr input array [on device] [len = size] - * @param[in] size the size of the input array - * @param[out] minLabel the lower bound of the range of labels - * @param[out] maxLabel the upper bound of the range of labels - * @param[in] stream cuda stream - * - * @return the number of unique elements in the array - */ -template -int countUnique(const T* arr, int size, T& minLabel, T& maxLabel, cudaStream_t stream) -{ - auto ptr = thrust::device_pointer_cast(arr); - auto minmax = thrust::minmax_element(thrust::cuda::par.on(stream), ptr, ptr + size); - minLabel = *minmax.first; - maxLabel = *minmax.second; - auto totalLabels = int(maxLabel - minLabel + 1); - rmm::device_uvector labelCounts(totalLabels, stream); - rmm::device_scalar nUniq(stream); - raft::stats::histogram( - raft::stats::HistTypeAuto, - labelCounts.data(), - totalLabels, - arr, - size, - 1, - stream, - [minLabel] __device__(T val, int row, int col) { return int(val - minLabel); }); - raft::linalg::mapThenSumReduce( - nUniq.data(), - totalLabels, - [] __device__(const T& val) { return val != 0; }, - stream, - labelCounts.data()); - auto numUniques = nUniq.value(stream); - return numUniques; -} - -/** - * @brief Function to calculate Adjusted RandIndex as described - * here - * @tparam T data-type for input label arrays - * @tparam MathT integral data-type used for computing n-choose-r - * @param firstClusterArray: the array of classes - * @param secondClusterArray: the array of classes - * @param size: the size of the data points of type int - * @param stream: the cudaStream object - */ -template -double compute_adjusted_rand_index(const T* firstClusterArray, - const T* secondClusterArray, - int size, - cudaStream_t stream) -{ - ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); - T minFirst, maxFirst, minSecond, maxSecond; - auto nUniqFirst = countUnique(firstClusterArray, size, minFirst, maxFirst, stream); - auto nUniqSecond = countUnique(secondClusterArray, size, minSecond, maxSecond, stream); - auto lowerLabelRange = std::min(minFirst, minSecond); - auto upperLabelRange = std::max(maxFirst, maxSecond); - auto nClasses = upperLabelRange - lowerLabelRange + 1; - // degenerate case of single cluster or clusters each with just one element - if (nUniqFirst == nUniqSecond) { - if (nUniqFirst == 1 || nUniqFirst == size) return 1.0; - } - auto nUniqClasses = MathT(nClasses); - rmm::device_uvector dContingencyMatrix(nUniqClasses * nUniqClasses, stream); - RAFT_CUDA_TRY(cudaMemsetAsync( - dContingencyMatrix.data(), 0, nUniqClasses * nUniqClasses * sizeof(MathT), stream)); - auto workspaceSz = getContingencyMatrixWorkspaceSize( - size, firstClusterArray, stream, lowerLabelRange, upperLabelRange); - rmm::device_uvector workspaceBuff(workspaceSz, stream); - contingencyMatrix(firstClusterArray, - secondClusterArray, - size, - dContingencyMatrix.data(), - stream, - workspaceBuff.data(), - workspaceSz, - lowerLabelRange, - upperLabelRange); - rmm::device_uvector a(nUniqClasses, stream); - rmm::device_uvector b(nUniqClasses, stream); - rmm::device_scalar d_aCTwoSum(stream); - rmm::device_scalar d_bCTwoSum(stream); - rmm::device_scalar d_nChooseTwoSum(stream); - MathT h_aCTwoSum, h_bCTwoSum, h_nChooseTwoSum; - RAFT_CUDA_TRY(cudaMemsetAsync(a.data(), 0, nUniqClasses * sizeof(MathT), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(b.data(), 0, nUniqClasses * sizeof(MathT), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(d_aCTwoSum.data(), 0, sizeof(MathT), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(d_bCTwoSum.data(), 0, sizeof(MathT), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(d_nChooseTwoSum.data(), 0, sizeof(MathT), stream)); - // calculating the sum of NijC2 - raft::linalg::mapThenSumReduce>(d_nChooseTwoSum.data(), - nUniqClasses * nUniqClasses, - nCTwo(), - stream, - dContingencyMatrix.data(), - dContingencyMatrix.data()); - // calculating the row-wise sums - raft::linalg::reduce( - a.data(), dContingencyMatrix.data(), nUniqClasses, nUniqClasses, 0, true, true, stream); - // calculating the column-wise sums - raft::linalg::reduce( - b.data(), dContingencyMatrix.data(), nUniqClasses, nUniqClasses, 0, true, false, stream); - // calculating the sum of number of unordered pairs for every element in a - raft::linalg::mapThenSumReduce>( - d_aCTwoSum.data(), nUniqClasses, nCTwo(), stream, a.data(), a.data()); - // calculating the sum of number of unordered pairs for every element of b - raft::linalg::mapThenSumReduce>( - d_bCTwoSum.data(), nUniqClasses, nCTwo(), stream, b.data(), b.data()); - // updating in the host memory - raft::update_host(&h_nChooseTwoSum, d_nChooseTwoSum.data(), 1, stream); - raft::update_host(&h_aCTwoSum, d_aCTwoSum.data(), 1, stream); - raft::update_host(&h_bCTwoSum, d_bCTwoSum.data(), 1, stream); - // calculating the ARI - auto nChooseTwo = double(size) * double(size - 1) / 2.0; - auto expectedIndex = double(h_aCTwoSum) * double(h_bCTwoSum) / double(nChooseTwo); - auto maxIndex = (double(h_bCTwoSum) + double(h_aCTwoSum)) / 2.0; - auto index = double(h_nChooseTwoSum); - if (maxIndex - expectedIndex) - return (index - expectedIndex) / (maxIndex - expectedIndex); - else - return 0; -} - -}; // end namespace Metrics -}; // end namespace MLCommon diff --git a/cpp/src_prims/metrics/batched/information_criterion.cuh b/cpp/src_prims/metrics/batched/information_criterion.cuh deleted file mode 100644 index 8770246f07..0000000000 --- a/cpp/src_prims/metrics/batched/information_criterion.cuh +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file information_criterion.cuh - * @brief These information criteria are used to evaluate the quality of models - * by balancing the quality of the fit and the number of parameters. - * - * See: - * - AIC: https://en.wikipedia.org/wiki/Akaike_information_criterion - * - AICc: https://en.wikipedia.org/wiki/Akaike_information_criterion#AICc - * - BIC: https://en.wikipedia.org/wiki/Bayesian_information_criterion - */ - -#include - -#include - -namespace MLCommon { -namespace Metrics { - -/// Supported types of information criteria -enum IC_Type { AIC, AICc, BIC }; - -namespace Batched { - -/** - * Compute the given type of information criterion - * - * @note: it is safe to do the computation in-place (i.e give same pointer - * as input and output) - * - * @param[out] d_ic Information criterion to be returned for each - * series (device) - * @param[in] d_loglikelihood Log-likelihood for each series (device) - * @param[in] ic_type Type of criterion to compute. See IC_Type - * @param[in] n_params Number of parameters in the model - * @param[in] batch_size Number of series in the batch - * @param[in] n_samples Number of samples in each series - * @param[in] stream CUDA stream - */ -template -void information_criterion(ScalarT* d_ic, - const ScalarT* d_loglikelihood, - IC_Type ic_type, - IdxT n_params, - IdxT batch_size, - IdxT n_samples, - cudaStream_t stream) -{ - ScalarT ic_base{}; - ScalarT N = static_cast(n_params); - ScalarT T = static_cast(n_samples); - switch (ic_type) { - case AIC: ic_base = (ScalarT)2.0 * N; break; - case AICc: - ic_base = (ScalarT)2.0 * (N + (N * (N + (ScalarT)1.0)) / (T - N - (ScalarT)1.0)); - break; - case BIC: ic_base = std::log(T) * N; break; - } - /* Compute information criterion from log-likelihood and base term */ - raft::linalg::unaryOp( - d_ic, - d_loglikelihood, - batch_size, - [=] __device__(ScalarT loglike) { return ic_base - (ScalarT)2.0 * loglike; }, - stream); -} - -} // namespace Batched -} // namespace Metrics -} // namespace MLCommon diff --git a/cpp/src_prims/metrics/batched/silhouette_score.cuh b/cpp/src_prims/metrics/batched/silhouette_score.cuh deleted file mode 100644 index 813393ce6b..0000000000 --- a/cpp/src_prims/metrics/batched/silhouette_score.cuh +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "../silhouette_score.cuh" -#include - -#include -#include -#include -#include -#include - -namespace MLCommon { -namespace Metrics { -namespace Batched { - -namespace detail { - -/** - * This kernel initializes matrix b (n_rows * n_labels) - * For each label that the corresponding row is not a part of is initialized as 0 - * If the corresponding row is the only sample in its label, again 0 - * Only if the there are > 1 samples in the label, row is initialized to max - */ -template -__global__ void fill_b_kernel(value_t* b, - const label_idx* y, - value_idx n_rows, - label_idx n_labels, - const value_idx* cluster_counts) -{ - value_idx idx = threadIdx.x + blockIdx.x * blockDim.x; - label_idx idy = threadIdx.y + blockIdx.y * blockDim.y; - - if (idx >= n_rows || idy >= n_labels) { return; } - - auto row_cluster = y[idx]; - - auto col_cluster_count = cluster_counts[idy]; - - // b for own cluster should be max value - // so that it does not interfere with min operator - // b is also max if col cluster count is 0 - // however, b is 0 if self cluster count is 1 - if (row_cluster == idy || col_cluster_count == 0) { - if (cluster_counts[row_cluster] == 1) { - b[idx * n_labels + idy] = 0; - } else { - b[idx * n_labels + idy] = std::numeric_limits::max(); - } - } else { - b[idx * n_labels + idy] = 0; - } -} - -/** - * This kernel does an elementwise sweep of chunked pairwise distance matrix - * By knowing the offsets of the chunked pairwise distance matrix in the - * global pairwise distance matrix, we are able to calculate - * intermediate values of a and b for the rows and columns present in the - * current chunked pairwise distance matrix. - */ -template -__global__ void compute_chunked_a_b_kernel(value_t* a, - value_t* b, - value_idx row_offset, - value_idx col_offset, - const label_idx* y, - label_idx n_labels, - const value_idx* cluster_counts, - const value_t* distances, - value_idx dist_rows, - value_idx dist_cols) -{ - value_idx row_id = threadIdx.x + blockIdx.x * blockDim.x; - value_idx col_id = threadIdx.y + blockIdx.y * blockDim.y; - - // these are global offsets of current element - // in the full pairwise distance matrix - value_idx pw_row_id = row_id + row_offset; - value_idx pw_col_id = col_id + col_offset; - - if (row_id >= dist_rows || col_id >= dist_cols || pw_row_id == pw_col_id) { return; } - - auto row_cluster = y[pw_row_id]; - if (cluster_counts[row_cluster] == 1) { return; } - - auto col_cluster = y[pw_col_id]; - auto col_cluster_counts = cluster_counts[col_cluster]; - - if (col_cluster == row_cluster) { - atomicAdd(&a[pw_row_id], distances[row_id * dist_cols + col_id] / (col_cluster_counts - 1)); - } else { - atomicAdd(&b[pw_row_id * n_labels + col_cluster], - distances[row_id * dist_cols + col_id] / col_cluster_counts); - } -} - -} // namespace detail - -template -rmm::device_uvector get_cluster_counts(const raft::handle_t& handle, - label_idx* y, - value_idx& n_rows, - label_idx& n_labels) -{ - auto stream = handle.get_stream(); - - rmm::device_uvector cluster_counts(n_labels, stream); - - rmm::device_uvector workspace(1, stream); - - MLCommon::Metrics::countLabels(y, cluster_counts.data(), n_rows, n_labels, workspace, stream); - - return cluster_counts; -} - -template -rmm::device_uvector get_pairwise_distance(const raft::handle_t& handle, - value_t* left_begin, - value_t* right_begin, - value_idx& n_left_rows, - value_idx& n_right_rows, - value_idx& n_cols, - raft::distance::DistanceType metric, - cudaStream_t stream) -{ - rmm::device_uvector distances(n_left_rows * n_right_rows, stream); - - ML::Metrics::pairwise_distance( - handle, left_begin, right_begin, distances.data(), n_left_rows, n_right_rows, n_cols, metric); - - return distances; -} - -template -void compute_chunked_a_b(const raft::handle_t& handle, - value_t* a, - value_t* b, - value_idx& row_offset, - value_idx& col_offset, - const label_idx* y, - label_idx& n_labels, - const value_idx* cluster_counts, - const value_t* distances, - value_idx& dist_rows, - value_idx& dist_cols, - cudaStream_t stream) -{ - dim3 block_size(std::min(dist_rows, 32), std::min(dist_cols, 32)); - dim3 grid_size(raft::ceildiv(dist_rows, (value_idx)block_size.x), - raft::ceildiv(dist_cols, (value_idx)block_size.y)); - - detail::compute_chunked_a_b_kernel<<>>( - a, b, row_offset, col_offset, y, n_labels, cluster_counts, distances, dist_rows, dist_cols); -} - -template -value_t silhouette_score( - const raft::handle_t& handle, - value_t* X, - value_idx n_rows, - value_idx n_cols, - label_idx* y, - label_idx n_labels, - value_t* scores, - value_idx chunk, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded) -{ - ASSERT(n_labels >= 2 && n_labels <= (n_rows - 1), - "silhouette Score not defined for the given number of labels!"); - - rmm::device_uvector cluster_counts = get_cluster_counts(handle, y, n_rows, n_labels); - - auto stream = handle.get_stream(); - auto policy = handle.get_thrust_policy(); - - auto b_size = n_rows * n_labels; - - value_t *a_ptr, *b_ptr; - rmm::device_uvector a(0, stream); - rmm::device_uvector b(b_size, stream); - - b_ptr = b.data(); - - // since a and silhouette score per sample are same size, reusing - if (scores == nullptr || scores == NULL) { - a.resize(n_rows, stream); - a_ptr = a.data(); - } else { - a_ptr = scores; - } - - thrust::fill(policy, a_ptr, a_ptr + n_rows, 0); - - dim3 block_size(std::min(n_rows, 32), std::min(n_labels, 32)); - dim3 grid_size(raft::ceildiv(n_rows, (value_idx)block_size.x), - raft::ceildiv(n_labels, (label_idx)block_size.y)); - detail::fill_b_kernel<<>>( - b_ptr, y, n_rows, n_labels, cluster_counts.data()); - - handle.wait_stream_pool_on_stream(); - - auto n_iters = 0; - - for (value_idx i = 0; i < n_rows; i += chunk) { - for (value_idx j = 0; j < n_rows; j += chunk) { - ++n_iters; - - auto chunk_stream = handle.get_next_usable_stream(i + chunk * j); - - auto* left_begin = X + (i * n_cols); - auto* right_begin = X + (j * n_cols); - - auto n_left_rows = (i + chunk) < n_rows ? chunk : (n_rows - i); - auto n_right_rows = (j + chunk) < n_rows ? chunk : (n_rows - j); - - rmm::device_uvector distances = get_pairwise_distance( - handle, left_begin, right_begin, n_left_rows, n_right_rows, n_cols, metric, chunk_stream); - - compute_chunked_a_b(handle, - a_ptr, - b_ptr, - i, - j, - y, - n_labels, - cluster_counts.data(), - distances.data(), - n_left_rows, - n_right_rows, - chunk_stream); - } - } - - handle.sync_stream_pool(); - - // calculating row-wise minimum in b - // this prim only supports int indices for now - raft::linalg:: - reduce, MLCommon::Metrics::MinOp>( - b_ptr, - b_ptr, - n_labels, - n_rows, - std::numeric_limits::max(), - true, - true, - stream, - false, - raft::Nop(), - MLCommon::Metrics::MinOp()); - - // calculating the silhouette score per sample - raft::linalg::binaryOp, value_t, value_idx>( - a_ptr, a_ptr, b_ptr, n_rows, MLCommon::Metrics::SilOp(), stream); - - return thrust::reduce(policy, a_ptr, a_ptr + n_rows, value_t(0)) / n_rows; -} - -} // namespace Batched -} // namespace Metrics -} // namespace MLCommon diff --git a/cpp/src_prims/metrics/completeness_score.cuh b/cpp/src_prims/metrics/completeness_score.cuh deleted file mode 100644 index d5805edc64..0000000000 --- a/cpp/src_prims/metrics/completeness_score.cuh +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file completeness_score.cuh - * - * @brief A clustering result satisfies completeness if all the data points - * that are members of a given class are elements of the same cluster. - */ - -#pragma once - -#include "entropy.cuh" -#include "mutual_info_score.cuh" - -namespace MLCommon { -namespace Metrics { - -/** - * @brief Function to calculate the completeness score between two clusters - * - * @param truthClusterArray: the array of truth classes of type T - * @param predClusterArray: the array of predicted classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - */ -template -double completeness_score(const T* truthClusterArray, - const T* predClusterArray, - int size, - T lowerLabelRange, - T upperLabelRange, - cudaStream_t stream) -{ - if (size == 0) return 1.0; - - double computedMI, computedEntropy; - - computedMI = MLCommon::Metrics::mutual_info_score( - truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream); - computedEntropy = - MLCommon::Metrics::entropy(predClusterArray, size, lowerLabelRange, upperLabelRange, stream); - - double completeness; - - if (computedEntropy) { - completeness = computedMI / computedEntropy; - } else - completeness = 1.0; - - return completeness; -} - -}; // end namespace Metrics -}; // end namespace MLCommon diff --git a/cpp/src_prims/metrics/contingencyMatrix.cuh b/cpp/src_prims/metrics/contingencyMatrix.cuh deleted file mode 100644 index 9fc0526565..0000000000 --- a/cpp/src_prims/metrics/contingencyMatrix.cuh +++ /dev/null @@ -1,312 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include -#include - -#include - -#include - -namespace MLCommon { -namespace Metrics { - -typedef enum { - IMPL_NONE, - SMEM_ATOMICS, - GLOBAL_ATOMICS, - SORT_AND_GATOMICS -} ContingencyMatrixImplType; - -template -__global__ void devConstructContingencyMatrix(const T* groundTruth, - const T* predicted, - int nSamples, - OutT* outMat, - int outIdxOffset, - int outMatWidth) -{ - int elementId = threadIdx.x + blockDim.x * blockIdx.x; - if (elementId < nSamples) { - T gt = groundTruth[elementId]; - T pd = predicted[elementId]; - auto outputIdx = (gt - outIdxOffset) * outMatWidth + pd - outIdxOffset; - raft::myAtomicAdd(outMat + outputIdx, OutT(1)); - } -} - -template -void computeCMatWAtomics(const T* groundTruth, - const T* predictedLabel, - int nSamples, - OutT* outMat, - int outIdxOffset, - int outDimN, - cudaStream_t stream) -{ - RAFT_CUDA_TRY( - cudaFuncSetCacheConfig(devConstructContingencyMatrix, cudaFuncCachePreferL1)); - static const int block = 128; - auto grid = raft::ceildiv(nSamples, block); - devConstructContingencyMatrix<<>>( - groundTruth, predictedLabel, nSamples, outMat, outIdxOffset, outDimN); - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -__global__ void devConstructContingencyMatrixSmem(const T* groundTruth, - const T* predicted, - int nSamples, - OutT* outMat, - int outIdxOffset, - int outMatWidth) -{ - extern __shared__ char smem[]; - auto* sMemMatrix = reinterpret_cast(smem); - for (int smemIdx = threadIdx.x; smemIdx < outMatWidth * outMatWidth; smemIdx += blockDim.x) { - sMemMatrix[smemIdx] = 0; - } - __syncthreads(); - int elementId = threadIdx.x + blockDim.x * blockIdx.x; - if (elementId < nSamples) { - T gt = groundTruth[elementId]; - T pd = predicted[elementId]; - auto outputIdx = (gt - outIdxOffset) * outMatWidth + pd - outIdxOffset; - raft::myAtomicAdd(sMemMatrix + outputIdx, OutT(1)); - } - __syncthreads(); - for (int smemIdx = threadIdx.x; smemIdx < outMatWidth * outMatWidth; smemIdx += blockDim.x) { - raft::myAtomicAdd(outMat + smemIdx, sMemMatrix[smemIdx]); - } -} - -template -void computeCMatWSmemAtomics(const T* groundTruth, - const T* predictedLabel, - int nSamples, - OutT* outMat, - int outIdxOffset, - int outDimN, - cudaStream_t stream) -{ - static const int block = 128; - auto grid = raft::ceildiv(nSamples, block); - size_t smemSizePerBlock = outDimN * outDimN * sizeof(OutT); - devConstructContingencyMatrixSmem<<>>( - groundTruth, predictedLabel, nSamples, outMat, outIdxOffset, outDimN); - RAFT_CUDA_TRY(cudaGetLastError()); -} - -template -void contingencyMatrixWSort(const T* groundTruth, - const T* predictedLabel, - int nSamples, - OutT* outMat, - T minLabel, - T maxLabel, - void* workspace, - size_t workspaceSize, - cudaStream_t stream) -{ - T* outKeys = reinterpret_cast(workspace); - auto alignedBufferSz = raft::alignTo(nSamples * sizeof(T), 256); - T* outValue = reinterpret_cast((size_t)workspace + alignedBufferSz); - void* pWorkspaceCub = reinterpret_cast((size_t)workspace + 2 * alignedBufferSz); - auto bitsToSort = log2(maxLabel); - if (!raft::isPo2(maxLabel)) ++bitsToSort; - // we dont really need perfect sorting, should get by with some sort of - // binning-reordering operation - ///@todo: future work - explore "efficient" custom binning kernels vs cub sort - RAFT_CUDA_TRY(cub::DeviceRadixSort::SortPairs(pWorkspaceCub, - workspaceSize, - groundTruth, - outKeys, - predictedLabel, - outValue, - nSamples, - 0, - bitsToSort, - stream)); - auto outDimM_N = int(maxLabel - minLabel + 1); - computeCMatWAtomics(outKeys, outValue, nSamples, outMat, minLabel, outDimM_N, stream); -} - -template -ContingencyMatrixImplType getImplVersion(OutT outDimN) -{ - int currDevice = 0; - int l2CacheSize = 0; - // no way to query this from CUDA APIs, value for CC 7.0, 3.0 - int maxBlocksResidentPerSM = 16; - RAFT_CUDA_TRY(cudaGetDevice(&currDevice)); - RAFT_CUDA_TRY(cudaDeviceGetAttribute(&l2CacheSize, cudaDevAttrL2CacheSize, currDevice)); - auto maxSmemPerBlock = raft::getSharedMemPerBlock(); - ContingencyMatrixImplType implVersion = IMPL_NONE; - // keeping 8 block per SM to get good utilization - // can go higher but reduced L1 size degrades perf - OutT upperLimitSmemAtomics = - std::floor(std::sqrt(maxSmemPerBlock / (sizeof(OutT) * (maxBlocksResidentPerSM / 2)))); - OutT upperLimitL2Atomics = std::floor(std::sqrt(l2CacheSize / sizeof(OutT))); - if (outDimN <= upperLimitSmemAtomics) - implVersion = SMEM_ATOMICS; - else if (outDimN <= upperLimitL2Atomics) - implVersion = GLOBAL_ATOMICS; - else - implVersion = SORT_AND_GATOMICS; - return implVersion; -} - -/** - * @brief use this to allocate output matrix size - * size of matrix = (maxLabel - minLabel + 1)^2 * sizeof(int) - * @param groundTruth: device 1-d array for ground truth (num of rows) - * @param nSamples: number of elements in input array - * @param stream: cuda stream for execution - * @param minLabel: [out] calculated min value in input array - * @param maxLabel: [out] calculated max value in input array - */ -template -void getInputClassCardinality( - const T* groundTruth, const int nSamples, cudaStream_t stream, T& minLabel, T& maxLabel) -{ - thrust::device_ptr dTrueLabel = thrust::device_pointer_cast(groundTruth); - auto min_max = - thrust::minmax_element(thrust::cuda::par.on(stream), dTrueLabel, dTrueLabel + nSamples); - minLabel = *min_max.first; - maxLabel = *min_max.second; -} - -/** - * @brief Calculate workspace size for running contingency matrix calculations - * @tparam T label type - * @tparam OutT output matrix type - * @param nSamples: number of elements in input array - * @param groundTruth: device 1-d array for ground truth (num of rows) - * @param stream: cuda stream for execution - * @param minLabel: Optional, min value in input array - * @param maxLabel: Optional, max value in input array - */ -template -size_t getContingencyMatrixWorkspaceSize(int nSamples, - const T* groundTruth, - cudaStream_t stream, - T minLabel = std::numeric_limits::max(), - T maxLabel = std::numeric_limits::max()) -{ - size_t workspaceSize = 0; - // below is a redundant computation - can be avoided - if (minLabel == std::numeric_limits::max() || maxLabel == std::numeric_limits::max()) { - getInputClassCardinality(groundTruth, nSamples, stream, minLabel, maxLabel); - } - auto outDimN = OutT(maxLabel - minLabel + 1); - ContingencyMatrixImplType implVersion = getImplVersion(outDimN); - if (implVersion == SORT_AND_GATOMICS) { - void* pWorkspaceCub{}; - size_t tmpStorageBytes = 0; - // no-op pointers to get workspace size - T* pTmpUnused{}; - RAFT_CUDA_TRY(cub::DeviceRadixSort::SortPairs( - pWorkspaceCub, tmpStorageBytes, pTmpUnused, pTmpUnused, pTmpUnused, pTmpUnused, nSamples)); - auto tmpStagingMemorySize = raft::alignTo(nSamples * sizeof(T), 256); - tmpStagingMemorySize *= 2; - workspaceSize = tmpStagingMemorySize + tmpStorageBytes; - } - return workspaceSize; -} - -/** - * @brief contruct contingency matrix given input ground truth and prediction - * labels. Users should call function getInputClassCardinality to find - * and allocate memory for output. Similarly workspace requirements - * should be checked using function getContingencyMatrixWorkspaceSize - * @tparam T label type - * @tparam OutT output matrix type - * @param groundTruth: device 1-d array for ground truth (num of rows) - * @param predictedLabel: device 1-d array for prediction (num of columns) - * @param nSamples: number of elements in input array - * @param outMat: output buffer for contingecy matrix - * @param stream: cuda stream for execution - * @param workspace: Optional, workspace memory allocation - * @param workspaceSize: Optional, size of workspace memory - * @param minLabel: Optional, min value in input ground truth array - * @param maxLabel: Optional, max value in input ground truth array - */ -template -void contingencyMatrix(const T* groundTruth, - const T* predictedLabel, - int nSamples, - OutT* outMat, - cudaStream_t stream, - void* workspace = nullptr, - size_t workspaceSize = 0, - T minLabel = std::numeric_limits::max(), - T maxLabel = std::numeric_limits::max()) -{ - // assumptions: - // output is not at par with scikit learn - output will be square matrix - // always with numRows = numColumns = numOfClassesInTrueLabel - // it is also assumed that true labels are monotically increasing - // if for some reason groundTruth completely skips some labels - // eg: {0,1,2,5} instead of {0,1,2,3}. - // Output matrix will still have empty rows for label value {3,4} - // Users can use "make_monotonic" to convert their discontinuous input label - // range to a monotonically increasing one // - // this also serves as way to measure co-occurence/joint counts for NLP tasks which - // can be used to then compute pointwise mutual information and mutual information - if (minLabel == std::numeric_limits::max() || maxLabel == std::numeric_limits::max()) { - getInputClassCardinality(groundTruth, nSamples, stream, minLabel, maxLabel); - } - auto outDimM_N = OutT(maxLabel - minLabel + 1); - RAFT_CUDA_TRY(cudaMemsetAsync(outMat, 0, sizeof(OutT) * outDimM_N * outDimM_N, stream)); - ContingencyMatrixImplType implVersion = getImplVersion(outDimM_N); - switch (implVersion) { - case SMEM_ATOMICS: - // smem atomics and then single global mem atomics only works - // when all label count can fit in smem for a block - // helps when GLOBAL_ATOMICS performance blocked by atomic update - // serialization -when very less labels ~10 labels - computeCMatWSmemAtomics( - groundTruth, predictedLabel, nSamples, outMat, minLabel, outDimM_N, stream); - break; - case GLOBAL_ATOMICS: - // launch kernel - global atomic ops per (groundTruth,predictedValue) pair - computeCMatWAtomics( - groundTruth, predictedLabel, nSamples, outMat, minLabel, outDimM_N, stream); - break; - // more L2 thrashing if atomic OPs land in completely different mem - // segment - when more labels - case SORT_AND_GATOMICS: - contingencyMatrixWSort(groundTruth, - predictedLabel, - nSamples, - outMat, - minLabel, - maxLabel, - workspace, - workspaceSize, - stream); - break; - case IMPL_NONE: break; - } -} - -}; // namespace Metrics -}; // namespace MLCommon diff --git a/cpp/src_prims/metrics/dispersion.cuh b/cpp/src_prims/metrics/dispersion.cuh deleted file mode 100644 index b2d3c007fb..0000000000 --- a/cpp/src_prims/metrics/dispersion.cuh +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace MLCommon { -namespace Metrics { - -///@todo: ColsPerBlk has been tested only for 32! -template -__global__ void weightedMeanKernel(DataT* mu, const DataT* data, const IdxT* counts, IdxT D, IdxT N) -{ - constexpr int RowsPerBlkPerIter = TPB / ColsPerBlk; - IdxT thisColId = threadIdx.x % ColsPerBlk; - IdxT thisRowId = threadIdx.x / ColsPerBlk; - IdxT colId = thisColId + ((IdxT)blockIdx.y * ColsPerBlk); - IdxT rowId = thisRowId + ((IdxT)blockIdx.x * RowsPerBlkPerIter); - DataT thread_data = DataT(0); - const IdxT stride = RowsPerBlkPerIter * gridDim.x; - __shared__ DataT smu[ColsPerBlk]; - if (threadIdx.x < ColsPerBlk) smu[threadIdx.x] = DataT(0); - for (IdxT i = rowId; i < N; i += stride) { - thread_data += (colId < D) ? data[i * D + colId] * (DataT)counts[i] : DataT(0); - } - __syncthreads(); - raft::myAtomicAdd(smu + thisColId, thread_data); - __syncthreads(); - if (threadIdx.x < ColsPerBlk && colId < D) raft::myAtomicAdd(mu + colId, smu[thisColId]); -} - -template -__global__ void dispersionKernel(DataT* result, - const DataT* clusters, - const IdxT* clusterSizes, - const DataT* mu, - IdxT dim, - IdxT nClusters) -{ - IdxT tid = threadIdx.x + blockIdx.x * blockDim.x; - IdxT len = dim * nClusters; - IdxT stride = blockDim.x * gridDim.x; - DataT sum = DataT(0); - for (; tid < len; tid += stride) { - IdxT col = tid % dim; - IdxT row = tid / dim; - DataT diff = clusters[tid] - mu[col]; - sum += diff * diff * DataT(clusterSizes[row]); - } - typedef cub::BlockReduce BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - __syncthreads(); - auto acc = BlockReduce(temp_storage).Sum(sum); - __syncthreads(); - if (threadIdx.x == 0) raft::myAtomicAdd(result, acc); -} - -/** - * @brief Compute cluster dispersion metric. This is very useful for - * automatically finding the 'k' (in kmeans) that improves this metric. - * @tparam DataT data type - * @tparam IdxT index type - * @tparam TPB threads block for kernels launched - * @param centroids the cluster centroids. This is assumed to be row-major - * and of dimension (nClusters x dim) - * @param clusterSizes number of points in the dataset which belong to each - * cluster. This is of length nClusters - * @param globalCentroid compute the global weighted centroid of all cluster - * centroids. This is of length dim. Pass a nullptr if this is not needed - * @param nClusters number of clusters - * @param nPoints number of points in the dataset - * @param dim dataset dimensionality - * @param stream cuda stream - * @return the cluster dispersion value - */ -template -DataT dispersion(const DataT* centroids, - const IdxT* clusterSizes, - DataT* globalCentroid, - IdxT nClusters, - IdxT nPoints, - IdxT dim, - cudaStream_t stream) -{ - static const int RowsPerThread = 4; - static const int ColsPerBlk = 32; - static const int RowsPerBlk = (TPB / ColsPerBlk) * RowsPerThread; - dim3 grid(raft::ceildiv(nPoints, (IdxT)RowsPerBlk), raft::ceildiv(dim, (IdxT)ColsPerBlk)); - rmm::device_uvector mean(0, stream); - rmm::device_uvector result(1, stream); - DataT* mu = globalCentroid; - if (globalCentroid == nullptr) { - mean.resize(dim, stream); - mu = mean.data(); - } - RAFT_CUDA_TRY(cudaMemsetAsync(mu, 0, sizeof(DataT) * dim, stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(result.data(), 0, sizeof(DataT), stream)); - weightedMeanKernel - <<>>(mu, centroids, clusterSizes, dim, nClusters); - RAFT_CUDA_TRY(cudaGetLastError()); - DataT ratio = DataT(1) / DataT(nPoints); - raft::linalg::scalarMultiply(mu, mu, ratio, dim, stream); - // finally, compute the dispersion - constexpr int ItemsPerThread = 4; - int nblks = raft::ceildiv(dim * nClusters, TPB * ItemsPerThread); - dispersionKernel - <<>>(result.data(), centroids, clusterSizes, mu, dim, nClusters); - RAFT_CUDA_TRY(cudaGetLastError()); - DataT h_result; - raft::update_host(&h_result, result.data(), 1, stream); - raft::interruptible::synchronize(stream); - return sqrt(h_result); -} - -} // end namespace Metrics -} // end namespace MLCommon diff --git a/cpp/src_prims/metrics/entropy.cuh b/cpp/src_prims/metrics/entropy.cuh deleted file mode 100644 index 55650a3345..0000000000 --- a/cpp/src_prims/metrics/entropy.cuh +++ /dev/null @@ -1,152 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file entropy.cuh - * @brief Calculates the entropy for a labeling in nats.(ie, uses natural logarithm for the - * calculations) - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace MLCommon { - -/** - * @brief Lambda to calculate the entropy of a sample given its probability value - * - * @param p: the input to the functional mapping - * @param q: dummy param - */ -struct entropyOp { - HDI double operator()(double p, double q) - { - if (p) - return -1 * (p) * (log(p)); - else - return 0.0; - } -}; - -namespace Metrics { - -/** - * @brief function to calculate the bincounts of number of samples in every label - * - * @tparam LabelT: type of the labels - * @param labels: the pointer to the array containing labels for every data sample - * @param binCountArray: pointer to the 1D array that contains the count of samples per cluster - * @param nRows: number of data samples - * @param lowerLabelRange - * @param upperLabelRange - * @param workspace: device buffer containing workspace memory - * @param stream: the cuda stream where to launch this kernel - */ -template -void countLabels(const LabelT* labels, - double* binCountArray, - int nRows, - LabelT lowerLabelRange, - LabelT upperLabelRange, - rmm::device_uvector& workspace, - cudaStream_t stream) -{ - int num_levels = upperLabelRange - lowerLabelRange + 2; - LabelT lower_level = lowerLabelRange; - LabelT upper_level = upperLabelRange + 1; - size_t temp_storage_bytes = 0; - - RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(nullptr, - temp_storage_bytes, - labels, - binCountArray, - num_levels, - lower_level, - upper_level, - nRows, - stream)); - - workspace.resize(temp_storage_bytes, stream); - - RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(workspace.data(), - temp_storage_bytes, - labels, - binCountArray, - num_levels, - lower_level, - upper_level, - nRows, - stream)); -} - -/** - * @brief Function to calculate entropy - * more info on entropy - * - * @param clusterArray: the array of classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - * @return the entropy score - */ -template -double entropy(const T* clusterArray, - const int size, - const T lowerLabelRange, - const T upperLabelRange, - cudaStream_t stream) -{ - if (!size) return 1.0; - - T numUniqueClasses = upperLabelRange - lowerLabelRange + 1; - - // declaring, allocating and initializing memory for bincount array and entropy values - rmm::device_uvector prob(numUniqueClasses, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(prob.data(), 0, numUniqueClasses * sizeof(double), stream)); - rmm::device_scalar d_entropy(stream); - RAFT_CUDA_TRY(cudaMemsetAsync(d_entropy.data(), 0, sizeof(double), stream)); - - // workspace allocation - rmm::device_uvector workspace(1, stream); - - // calculating the bincounts and populating the prob array - countLabels(clusterArray, prob.data(), size, lowerLabelRange, upperLabelRange, workspace, stream); - - // scalar dividing by size - raft::linalg::divideScalar( - prob.data(), prob.data(), (double)size, numUniqueClasses, stream); - - // calculating the aggregate entropy - raft::linalg::mapThenSumReduce( - d_entropy.data(), numUniqueClasses, entropyOp(), stream, prob.data(), prob.data()); - - // updating in the host memory - double h_entropy; - raft::update_host(&h_entropy, d_entropy.data(), 1, stream); - - raft::interruptible::synchronize(stream); - - return h_entropy; -} - -}; // end namespace Metrics -}; // end namespace MLCommon diff --git a/cpp/src_prims/metrics/homogeneity_score.cuh b/cpp/src_prims/metrics/homogeneity_score.cuh deleted file mode 100644 index 8ba90b38e8..0000000000 --- a/cpp/src_prims/metrics/homogeneity_score.cuh +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file homogeneity_score.cuh - * - * @brief A clustering result satisfies homogeneity if all of its clusters - * contain only data points which are members of a single class. - */ - -#include "entropy.cuh" -#include "mutual_info_score.cuh" -#include - -namespace MLCommon { - -namespace Metrics { - -/** - * @brief Function to calculate the homogeneity score between two clusters - * more info on mutual - * information - * @param truthClusterArray: the array of truth classes of type T - * @param predClusterArray: the array of predicted classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - */ -template -double homogeneity_score(const T* truthClusterArray, - const T* predClusterArray, - int size, - T lowerLabelRange, - T upperLabelRange, - cudaStream_t stream) -{ - if (size == 0) return 1.0; - - double computedMI, computedEntropy; - - computedMI = MLCommon::Metrics::mutual_info_score( - truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream); - computedEntropy = - MLCommon::Metrics::entropy(truthClusterArray, size, lowerLabelRange, upperLabelRange, stream); - - double homogeneity; - - if (computedEntropy) { - homogeneity = computedMI / computedEntropy; - } else - homogeneity = 1.0; - - return homogeneity; -} - -}; // end namespace Metrics -}; // end namespace MLCommon diff --git a/cpp/src_prims/metrics/kl_divergence.cuh b/cpp/src_prims/metrics/kl_divergence.cuh deleted file mode 100644 index bce3bf7283..0000000000 --- a/cpp/src_prims/metrics/kl_divergence.cuh +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file kl_divergence.cuh - * @brief The KL divergence tells us how well the probability distribution Q AKA candidatePDF - * approximates the probability distribution P AKA modelPDF. - */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace MLCommon { - -/** - * @brief the KL Diverence mapping function - * - * @tparam Type: Data type of the input - * @param modelPDF: the model probability density function of type DataT - * @param candidatePDF: the candidate probability density function of type DataT - */ -template -struct KLDOp { - HDI Type operator()(Type modelPDF, Type candidatePDF) - { - if (modelPDF == 0.0) - return 0; - - else - return modelPDF * (log(modelPDF) - log(candidatePDF)); - } -}; - -namespace Metrics { - -/** - * @brief Function to calculate KL Divergence - * more info on KL - * Divergence - * - * @tparam DataT: Data type of the input array - * @param modelPDF: the model array of probability density functions of type DataT - * @param candidatePDF: the candidate array of probability density functions of type DataT - * @param size: the size of the data points of type int - * @param stream: the cudaStream object - */ -template -DataT kl_divergence(const DataT* modelPDF, const DataT* candidatePDF, int size, cudaStream_t stream) -{ - rmm::device_scalar d_KLDVal(stream); - RAFT_CUDA_TRY(cudaMemsetAsync(d_KLDVal.data(), 0, sizeof(DataT), stream)); - - raft::linalg::mapThenSumReduce, 256, const DataT*>( - d_KLDVal.data(), (size_t)size, KLDOp(), stream, modelPDF, candidatePDF); - - DataT h_KLDVal; - - raft::update_host(&h_KLDVal, d_KLDVal.data(), 1, stream); - - raft::interruptible::synchronize(stream); - - return h_KLDVal; -} - -}; // end namespace Metrics -}; // end namespace MLCommon diff --git a/cpp/src_prims/metrics/mutual_info_score.cuh b/cpp/src_prims/metrics/mutual_info_score.cuh deleted file mode 100644 index f20de778e4..0000000000 --- a/cpp/src_prims/metrics/mutual_info_score.cuh +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file mutual_info_score.cuh - * @brief The Mutual Information is a measure of the similarity between two labels of - * the same data.This metric is independent of the absolute values of the labels: - * a permutation of the class or cluster label values won't change the - * score value in any way. - * This metric is furthermore symmetric.This can be useful to - * measure the agreement of two independent label assignments strategies - * on the same dataset when the real ground truth is not known. - */ - -#include "contingencyMatrix.cuh" -#include -#include -#include -#include -#include -#include -#include - -namespace MLCommon { - -namespace Metrics { - -/** - * @brief kernel to calculate the mutual info score - * @param dContingencyMatrix: the contingency matrix corresponding to the two clusters - * @param a: the row wise sum of the contingency matrix, which is also the bin counts of first - * cluster array - * @param b: the column wise sum of the contingency matrix, which is also the bin counts of second - * cluster array - * @param numUniqueClasses: number of unique classes - * @param size: the size of array a and b (size of the contingency matrix is (size x size)) - * @param d_MI: pointer to the device memory that stores the aggreggate mutual information - */ -template -__global__ void mutual_info_kernel(const int* dContingencyMatrix, - const int* a, - const int* b, - int numUniqueClasses, - int size, - double* d_MI) -{ - // calculating the indices of pairs of datapoints compared by the current thread - int j = threadIdx.x + blockIdx.x * blockDim.x; - int i = threadIdx.y + blockIdx.y * blockDim.y; - - // thread-local variable to count the mutual info - double localMI = 0.0; - - if (i < numUniqueClasses && j < numUniqueClasses && a[i] * b[j] != 0 && - dContingencyMatrix[i * numUniqueClasses + j] != 0) { - localMI += (double(dContingencyMatrix[i * numUniqueClasses + j])) * - (log(double(size) * double(dContingencyMatrix[i * numUniqueClasses + j])) - - log(double(a[i] * b[j]))); - } - - // specialize blockReduce for a 2D block of 1024 threads of type uint64_t - typedef cub::BlockReduce - BlockReduce; - - // Allocate shared memory for blockReduce - __shared__ typename BlockReduce::TempStorage temp_storage; - - // summing up thread-local counts specific to a block - localMI = BlockReduce(temp_storage).Sum(localMI); - __syncthreads(); - - // executed once per block - if (threadIdx.x == 0 && threadIdx.y == 0) { raft::myAtomicAdd(d_MI, localMI); } -} - -/** - * @brief Function to calculate the mutual information between two clusters - * more info on mutual information - * @param firstClusterArray: the array of classes of type T - * @param secondClusterArray: the array of classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - */ -template -double mutual_info_score(const T* firstClusterArray, - const T* secondClusterArray, - int size, - T lowerLabelRange, - T upperLabelRange, - cudaStream_t stream) -{ - int numUniqueClasses = upperLabelRange - lowerLabelRange + 1; - - // declaring, allocating and initializing memory for the contingency marix - rmm::device_uvector dContingencyMatrix(numUniqueClasses * numUniqueClasses, stream); - RAFT_CUDA_TRY(cudaMemsetAsync( - dContingencyMatrix.data(), 0, numUniqueClasses * numUniqueClasses * sizeof(int), stream)); - - // workspace allocation - size_t workspaceSz = MLCommon::Metrics::getContingencyMatrixWorkspaceSize( - size, firstClusterArray, stream, lowerLabelRange, upperLabelRange); - rmm::device_uvector pWorkspace(workspaceSz, stream); - - // calculating the contingency matrix - MLCommon::Metrics::contingencyMatrix(firstClusterArray, - secondClusterArray, - (int)size, - (int*)dContingencyMatrix.data(), - stream, - (void*)pWorkspace.data(), - workspaceSz, - lowerLabelRange, - upperLabelRange); - - // creating device buffers for all the parameters involved in ARI calculation - // device variables - rmm::device_uvector a(numUniqueClasses, stream); - rmm::device_uvector b(numUniqueClasses, stream); - rmm::device_scalar d_MI(stream); - - // host variables - double h_MI; - - // initializing device memory - RAFT_CUDA_TRY(cudaMemsetAsync(a.data(), 0, numUniqueClasses * sizeof(int), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(b.data(), 0, numUniqueClasses * sizeof(int), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(d_MI.data(), 0, sizeof(double), stream)); - - // calculating the row-wise sums - raft::linalg::reduce( - a.data(), dContingencyMatrix.data(), numUniqueClasses, numUniqueClasses, 0, true, true, stream); - - // calculating the column-wise sums - raft::linalg::reduce(b.data(), - dContingencyMatrix.data(), - numUniqueClasses, - numUniqueClasses, - 0, - true, - false, - stream); - - // kernel configuration - static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16; - dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y); - dim3 numBlocks(raft::ceildiv(numUniqueClasses, numThreadsPerBlock.x), - raft::ceildiv(numUniqueClasses, numThreadsPerBlock.y)); - - // calling the kernel - mutual_info_kernel<<>>( - dContingencyMatrix.data(), a.data(), b.data(), numUniqueClasses, size, d_MI.data()); - - // updating in the host memory - h_MI = d_MI.value(stream); - - raft::interruptible::synchronize(stream); - - return h_MI / size; -} - -}; // end namespace Metrics -}; // end namespace MLCommon diff --git a/cpp/src_prims/metrics/rand_index.cuh b/cpp/src_prims/metrics/rand_index.cuh deleted file mode 100644 index f1acf30ac5..0000000000 --- a/cpp/src_prims/metrics/rand_index.cuh +++ /dev/null @@ -1,164 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * @file rand_index.cuh - * @todo TODO(Ganesh Venkataramana): - *
- * The below rand_index calculation implementation is a Brute force one that uses
- (nElements*nElements) threads (2 dimensional grids and blocks)
- * For small datasets, this will suffice; but for larger ones, work done by the threads increase
- dramatically.
- * A more mathematically intensive implementation that uses half the above threads can be done,
- which will prove to be more efficient for larger datasets
- * the idea is as follows:
-  * instead of 2D block and grid configuration with a total of (nElements*nElements) threads (where
- each (i,j) through these threads represent an ordered pair selection of 2 data points), a 1D block
- and grid configuration with a total of (nElements*(nElements))/2 threads (each thread index
- represents an element part of the set of unordered pairwise selections from the dataset (nChoose2))
-  * In this setup, one has to generate a one-to-one mapping between this 1D thread index (for each
- kernel) and the unordered pair of chosen datapoints.
-  * More specifically, thread0-> {dataPoint1, dataPoint0}, thread1-> {dataPoint2, dataPoint0},
- thread2-> {dataPoint2, dataPoint1} ... thread((nElements*(nElements))/2 - 1)->
- {dataPoint(nElements-1),dataPoint(nElements-2)}
-  * say ,
-     * threadNum: thread index | threadNum = threadIdx.x + BlockIdx.x*BlockDim.x,
-     * i : index of dataPoint i
-     * j : index of dataPoint j
-  * then the mapping is as follows:
-     * i = ceil((-1 + sqrt(1 + 8*(1 + threadNum)))/2) = floor((1 + sqrt(1 + 8*threadNum))/2)
-     * j = threadNum - i(i-1)/2
-  * after obtaining the the pair of datapoints, calculation of rand index is the same as done in
- this implementation
- * Caveat: since the kernel implementation involves use of emulated sqrt() operations:
-  * the number of instructions executed per kernel is ~40-50 times
-  * as the O(nElements*nElements) increase beyond the floating point limit, floating point
- inaccuracies occur, and hence the above floor(...) !=  ceil(...)
- * 
- */ - -#pragma once - -#include -#include -#include -#include -#include - -namespace MLCommon { -namespace Metrics { - -/** - * @brief kernel to calculate the values of a and b - * @param firstClusterArray: the array of classes of type T - * @param secondClusterArray: the array of classes of type T - * @param size: the size of the data points - * @param a: number of pairs of points that both the clusters have classified the same - * @param b: number of pairs of points that both the clusters have classified differently - */ -template -__global__ void computeTheNumerator( - const T* firstClusterArray, const T* secondClusterArray, uint64_t size, uint64_t* a, uint64_t* b) -{ - // calculating the indices of pairs of datapoints compared by the current thread - uint64_t j = threadIdx.x + blockIdx.x * blockDim.x; - uint64_t i = threadIdx.y + blockIdx.y * blockDim.y; - - // thread-local variables to count a and b - uint64_t myA = 0, myB = 0; - - if (i < size && j < size && j < i) { - // checking if the pair have been classified the same by both the clusters - if (firstClusterArray[i] == firstClusterArray[j] && - secondClusterArray[i] == secondClusterArray[j]) { - ++myA; - } - - // checking if the pair have been classified differently by both the clusters - else if (firstClusterArray[i] != firstClusterArray[j] && - secondClusterArray[i] != secondClusterArray[j]) { - ++myB; - } - } - - // specialize blockReduce for a 2D block of 1024 threads of type uint64_t - typedef cub::BlockReduce - BlockReduce; - - // Allocate shared memory for blockReduce - __shared__ typename BlockReduce::TempStorage temp_storage; - - // summing up thread-local counts specific to a block - myA = BlockReduce(temp_storage).Sum(myA); - __syncthreads(); - myB = BlockReduce(temp_storage).Sum(myB); - __syncthreads(); - - // executed once per block - if (threadIdx.x == 0 && threadIdx.y == 0) { - raft::myAtomicAdd((unsigned long long int*)a, myA); - raft::myAtomicAdd((unsigned long long int*)b, myB); - } -} - -/** - * @brief Function to calculate RandIndex - * more info on rand index - * @param firstClusterArray: the array of classes of type T - * @param secondClusterArray: the array of classes of type T - * @param size: the size of the data points of type uint64_t - * @param stream: the cudaStream object - */ -template -double compute_rand_index(T* firstClusterArray, - T* secondClusterArray, - uint64_t size, - cudaStream_t stream) -{ - // rand index for size less than 2 is not defined - ASSERT(size >= 2, "Rand Index for size less than 2 not defined!"); - - // allocating and initializing memory for a and b in the GPU - rmm::device_uvector arr_buf(2, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(arr_buf.data(), 0, 2 * sizeof(uint64_t), stream)); - - // kernel configuration - static const int BLOCK_DIM_Y = 16, BLOCK_DIM_X = 16; - dim3 numThreadsPerBlock(BLOCK_DIM_X, BLOCK_DIM_Y); - dim3 numBlocks(raft::ceildiv(size, numThreadsPerBlock.x), - raft::ceildiv(size, numThreadsPerBlock.y)); - - // calling the kernel - computeTheNumerator<<>>( - firstClusterArray, secondClusterArray, size, arr_buf.data(), arr_buf.data() + 1); - - // synchronizing and updating the calculated values of a and b from device to host - uint64_t ab_host[2] = {0}; - raft::update_host(ab_host, arr_buf.data(), 2, stream); - raft::interruptible::synchronize(stream); - - // error handling - RAFT_CUDA_TRY(cudaGetLastError()); - - // denominator - uint64_t nChooseTwo = size * (size - 1) / 2; - - // calculating the rand_index - return (double)(((double)(ab_host[0] + ab_host[1])) / (double)nChooseTwo); -} - -}; // end namespace Metrics -}; // end namespace MLCommon diff --git a/cpp/src_prims/metrics/scores.cuh b/cpp/src_prims/metrics/scores.cuh deleted file mode 100644 index 50fa3eca04..0000000000 --- a/cpp/src_prims/metrics/scores.cuh +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define N_THREADS 512 - -namespace MLCommon { -namespace Score { - -/** - * Calculates the "Coefficient of Determination" (R-Squared) score - * normalizing the sum of squared errors by the total sum of squares. - * - * This score indicates the proportionate amount of variation in an - * expected response variable is explained by the independent variables - * in a linear regression model. The larger the R-squared value, the - * more variability is explained by the linear regression model. - * - * @param y: Array of ground-truth response variables - * @param y_hat: Array of predicted response variables - * @param n: Number of elements in y and y_hat - * @param stream: cuda stream - * @return: The R-squared value. - */ -template -math_t r2_score(math_t* y, math_t* y_hat, int n, cudaStream_t stream) -{ - rmm::device_scalar y_bar(stream); - - raft::stats::mean(y_bar.data(), y, 1, n, false, false, stream); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - rmm::device_uvector sse_arr(n, stream); - - raft::linalg::eltwiseSub(sse_arr.data(), y, y_hat, n, stream); - raft::linalg::powerScalar(sse_arr.data(), sse_arr.data(), math_t(2.0), n, stream); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - rmm::device_uvector ssto_arr(n, stream); - - raft::linalg::subtractDevScalar(ssto_arr.data(), y, y_bar.data(), n, stream); - raft::linalg::powerScalar(ssto_arr.data(), ssto_arr.data(), math_t(2.0), n, stream); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - thrust::device_ptr d_sse = thrust::device_pointer_cast(sse_arr.data()); - thrust::device_ptr d_ssto = thrust::device_pointer_cast(ssto_arr.data()); - - math_t sse = thrust::reduce(thrust::cuda::par.on(stream), d_sse, d_sse + n); - math_t ssto = thrust::reduce(thrust::cuda::par.on(stream), d_ssto, d_ssto + n); - - return 1.0 - sse / ssto; -} - -/** - * @brief Compute accuracy of predictions. Useful for classification. - * @tparam math_t: data type for predictions (e.g., int for classification) - * @param[in] predictions: array of predictions (GPU pointer). - * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer). - * @param[in] n: number of elements in each of predictions, ref_predictions. - * @param[in] stream: cuda stream. - * @return: Accuracy score in [0, 1]; higher is better. - */ -template -float accuracy_score(const math_t* predictions, - const math_t* ref_predictions, - int n, - cudaStream_t stream) -{ - unsigned long long correctly_predicted = 0ULL; - rmm::device_uvector diffs_array(n, stream); - - // TODO could write a kernel instead - raft::linalg::eltwiseSub(diffs_array.data(), predictions, ref_predictions, n, stream); - RAFT_CUDA_TRY(cudaGetLastError()); - correctly_predicted = - thrust::count(thrust::cuda::par.on(stream), diffs_array.data(), diffs_array.data() + n, 0); - - float accuracy = correctly_predicted * 1.0f / n; - return accuracy; -} - -template -__global__ void reg_metrics_kernel( - const T* predictions, const T* ref_predictions, int n, double* abs_diffs, double* tmp_sums) -{ - int tid = threadIdx.x + blockIdx.x * blockDim.x; - __shared__ double shmem[2]; // {abs_difference_sum, squared difference sum} - - for (int i = threadIdx.x; i < 2; i += blockDim.x) { - shmem[i] = 0; - } - __syncthreads(); - - for (int i = tid; i < n; i += blockDim.x * gridDim.x) { - double diff = predictions[i] - ref_predictions[i]; - double abs_diff = abs(diff); - raft::myAtomicAdd(&shmem[0], abs_diff); - raft::myAtomicAdd(&shmem[1], diff * diff); - - // update absolute difference in global memory for subsequent abs. median computation - abs_diffs[i] = abs_diff; - } - __syncthreads(); - - // Update tmp_sum w/ total abs_difference_sum and squared difference sum. - for (int i = threadIdx.x; i < 2; i += blockDim.x) { - raft::myAtomicAdd(&tmp_sums[i], shmem[i]); - } -} - -/** - * @brief Compute regression metrics mean absolute error, mean squared error, median absolute error - * @tparam T: data type for predictions (e.g., float or double for regression). - * @param[in] predictions: array of predictions (GPU pointer). - * @param[in] ref_predictions: array of reference (ground-truth) predictions (GPU pointer). - * @param[in] n: number of elements in each of predictions, ref_predictions. Should be > 0. - * @param[in] stream: cuda stream. - * @param[out] mean_abs_error: Mean Absolute Error. Sum over n of (|predictions[i] - - * ref_predictions[i]|) / n. - * @param[out] mean_squared_error: Mean Squared Error. Sum over n of ((predictions[i] - - * ref_predictions[i])^2) / n. - * @param[out] median_abs_error: Median Absolute Error. Median of |predictions[i] - - * ref_predictions[i]| for i in [0, n). - */ -template -void regression_metrics(const T* predictions, - const T* ref_predictions, - int n, - cudaStream_t stream, - double& mean_abs_error, - double& mean_squared_error, - double& median_abs_error) -{ - std::vector mean_errors(2); - std::vector h_sorted_abs_diffs(n); - int thread_cnt = 256; - int block_cnt = raft::ceildiv(n, thread_cnt); - - int array_size = n * sizeof(double); - rmm::device_uvector abs_diffs_array(array_size, stream); - rmm::device_uvector sorted_abs_diffs(array_size, stream); - rmm::device_uvector tmp_sums(2 * sizeof(double), stream); - RAFT_CUDA_TRY(cudaMemsetAsync(tmp_sums.data(), 0, 2 * sizeof(double), stream)); - - reg_metrics_kernel<<>>( - predictions, ref_predictions, n, abs_diffs_array.data(), tmp_sums.data()); - RAFT_CUDA_TRY(cudaGetLastError()); - raft::update_host(&mean_errors[0], tmp_sums.data(), 2, stream); - raft::interruptible::synchronize(stream); - - mean_abs_error = mean_errors[0] / n; - mean_squared_error = mean_errors[1] / n; - - // Compute median error. Sort diffs_array and pick median value - char* temp_storage = nullptr; - size_t temp_storage_bytes; - RAFT_CUDA_TRY(cub::DeviceRadixSort::SortKeys((void*)temp_storage, - temp_storage_bytes, - abs_diffs_array.data(), - sorted_abs_diffs.data(), - n, - 0, - 8 * sizeof(double), - stream)); - rmm::device_uvector temp_storage_v(temp_storage_bytes, stream); - temp_storage = temp_storage_v.data(); - RAFT_CUDA_TRY(cub::DeviceRadixSort::SortKeys((void*)temp_storage, - temp_storage_bytes, - abs_diffs_array.data(), - sorted_abs_diffs.data(), - n, - 0, - 8 * sizeof(double), - stream)); - - raft::update_host(h_sorted_abs_diffs.data(), sorted_abs_diffs.data(), n, stream); - raft::interruptible::synchronize(stream); - - int middle = n / 2; - if (n % 2 == 1) { - median_abs_error = h_sorted_abs_diffs[middle]; - } else { - median_abs_error = (h_sorted_abs_diffs[middle] + h_sorted_abs_diffs[middle - 1]) / 2; - } -} -} // namespace Score -} // namespace MLCommon diff --git a/cpp/src_prims/metrics/silhouette_score.cuh b/cpp/src_prims/metrics/silhouette_score.cuh deleted file mode 100644 index fa23e85722..0000000000 --- a/cpp/src_prims/metrics/silhouette_score.cuh +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace MLCommon { -namespace Metrics { - -/** - * @brief kernel that calculates the average intra-cluster distance for every sample data point and - * updates the cluster distance to max value - * @tparam DataT: type of the data samples - * @tparam LabelT: type of the labels - * @param sampleToClusterSumOfDistances: the pointer to the 2D array that contains the sum of - * distances from every sample to every cluster (nRows x nLabels) - * @param binCountArray: pointer to the 1D array that contains the count of samples per cluster (1 x - * nLabels) - * @param d_aArray: the pointer to the array of average intra-cluster distances for every sample in - * device memory (1 x nRows) - * @param labels: the pointer to the array containing labels for every data sample (1 x nRows) - * @param nRows: number of data samples - * @param nLabels: number of Labels - * @param MAX_VAL: DataT specific upper limit - */ -template -__global__ void populateAKernel(DataT* sampleToClusterSumOfDistances, - DataT* binCountArray, - DataT* d_aArray, - LabelT* labels, - int nRows, - int nLabels, - const DataT MAX_VAL) -{ - // getting the current index - int sampleIndex = threadIdx.x + blockIdx.x * blockDim.x; - - if (sampleIndex >= nRows) return; - - // sampleDistanceVector is an array that stores that particular row of the distanceMatrix - DataT* sampleToClusterSumOfDistancesVector = - &sampleToClusterSumOfDistances[sampleIndex * nLabels]; - - LabelT sampleCluster = labels[sampleIndex]; - - int sampleClusterIndex = (int)sampleCluster; - - if (binCountArray[sampleClusterIndex] - 1 <= 0) { - d_aArray[sampleIndex] = -1; - return; - - } - - else { - d_aArray[sampleIndex] = (sampleToClusterSumOfDistancesVector[sampleClusterIndex]) / - (binCountArray[sampleClusterIndex] - 1); - - // modifying the sampleDistanceVector to give sample average distance - sampleToClusterSumOfDistancesVector[sampleClusterIndex] = MAX_VAL; - } -} - -/** - * @brief function to calculate the bincounts of number of samples in every label - * @tparam DataT: type of the data samples - * @tparam LabelT: type of the labels - * @param labels: the pointer to the array containing labels for every data sample (1 x nRows) - * @param binCountArray: pointer to the 1D array that contains the count of samples per cluster (1 x - * nLabels) - * @param nRows: number of data samples - * @param nUniqueLabels: number of Labels - * @param workspace: device buffer containing workspace memory - * @param stream: the cuda stream where to launch this kernel - */ -template -void countLabels(LabelT* labels, - DataT* binCountArray, - int nRows, - int nUniqueLabels, - rmm::device_uvector& workspace, - cudaStream_t stream) -{ - int num_levels = nUniqueLabels + 1; - LabelT lower_level = 0; - LabelT upper_level = nUniqueLabels; - size_t temp_storage_bytes = 0; - - rmm::device_uvector countArray(nUniqueLabels, stream); - - RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(nullptr, - temp_storage_bytes, - labels, - binCountArray, - num_levels, - lower_level, - upper_level, - nRows, - stream)); - - workspace.resize(temp_storage_bytes, stream); - - RAFT_CUDA_TRY(cub::DeviceHistogram::HistogramEven(workspace.data(), - temp_storage_bytes, - labels, - binCountArray, - num_levels, - lower_level, - upper_level, - nRows, - stream)); -} - -/** - * @brief stucture that defines the division Lambda for elementwise op - */ -template -struct DivOp { - HDI DataT operator()(DataT a, int b, int c) - { - if (b == 0) - return ULLONG_MAX; - else - return a / b; - } -}; - -/** - * @brief stucture that defines the elementwise operation to calculate silhouette score using params - * 'a' and 'b' - */ -template -struct SilOp { - HDI DataT operator()(DataT a, DataT b) - { - if (a == 0 && b == 0 || a == b) - return 0; - else if (a == -1) - return 0; - else if (a > b) - return (b - a) / a; - else - return (b - a) / b; - } -}; - -/** - * @brief stucture that defines the reduction Lambda to find minimum between elements - */ -template -struct MinOp { - HDI DataT operator()(DataT a, DataT b) - { - if (a > b) - return b; - else - return a; - } -}; - -/** - * @brief main function that returns the average silhouette score for a given set of data and its - * clusterings - * @tparam DataT: type of the data samples - * @tparam LabelT: type of the labels - * @param X_in: pointer to the input Data samples array (nRows x nCols) - * @param nRows: number of data samples - * @param nCols: number of features - * @param labels: the pointer to the array containing labels for every data sample (1 x nRows) - * @param nLabels: number of Labels - * @param silhouette_scorePerSample: pointer to the array that is optionally taken in as input and - * is populated with the silhouette score for every sample (1 x nRows) - * @param stream: the cuda stream where to launch this kernel - * @param metric: the numerical value that maps to the type of distance metric to be used in the - * calculations - */ -template -DataT silhouette_score( - const raft::handle_t& handle, - DataT* X_in, - int nRows, - int nCols, - LabelT* labels, - int nLabels, - DataT* silhouette_scorePerSample, - cudaStream_t stream, - raft::distance::DistanceType metric = raft::distance::DistanceType::L2Unexpanded) -{ - ASSERT(nLabels >= 2 && nLabels <= (nRows - 1), - "silhouette Score not defined for the given number of labels!"); - - // compute the distance matrix - rmm::device_uvector distanceMatrix(nRows * nRows, stream); - rmm::device_uvector workspace(1, stream); - - ML::Metrics::pairwise_distance( - handle, X_in, X_in, distanceMatrix.data(), nRows, nRows, nCols, metric); - - // deciding on the array of silhouette scores for each dataPoint - rmm::device_uvector silhouette_scoreSamples(0, stream); - DataT* perSampleSilScore = nullptr; - if (silhouette_scorePerSample == nullptr) { - silhouette_scoreSamples.resize(nRows, stream); - perSampleSilScore = silhouette_scoreSamples.data(); - } else { - perSampleSilScore = silhouette_scorePerSample; - } - RAFT_CUDA_TRY(cudaMemsetAsync(perSampleSilScore, 0, nRows * sizeof(DataT), stream)); - - // getting the sample count per cluster - rmm::device_uvector binCountArray(nLabels, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(binCountArray.data(), 0, nLabels * sizeof(DataT), stream)); - countLabels(labels, binCountArray.data(), nRows, nLabels, workspace, stream); - - // calculating the sample-cluster-distance-sum-array - rmm::device_uvector sampleToClusterSumOfDistances(nRows * nLabels, stream); - RAFT_CUDA_TRY(cudaMemsetAsync( - sampleToClusterSumOfDistances.data(), 0, nRows * nLabels * sizeof(DataT), stream)); - raft::linalg::reduce_cols_by_key(distanceMatrix.data(), - labels, - sampleToClusterSumOfDistances.data(), - nRows, - nRows, - nLabels, - stream); - - // creating the a array and b array - rmm::device_uvector d_aArray(nRows, stream); - rmm::device_uvector d_bArray(nRows, stream); - RAFT_CUDA_TRY(cudaMemsetAsync(d_aArray.data(), 0, nRows * sizeof(DataT), stream)); - RAFT_CUDA_TRY(cudaMemsetAsync(d_bArray.data(), 0, nRows * sizeof(DataT), stream)); - - // kernel that populates the d_aArray - // kernel configuration - dim3 numThreadsPerBlock(32, 1, 1); - dim3 numBlocks(raft::ceildiv(nRows, numThreadsPerBlock.x), 1, 1); - - // calling the kernel - populateAKernel<<>>( - sampleToClusterSumOfDistances.data(), - binCountArray.data(), - d_aArray.data(), - labels, - nRows, - nLabels, - std::numeric_limits::max()); - - // elementwise dividing by bincounts - rmm::device_uvector averageDistanceBetweenSampleAndCluster(nRows * nLabels, stream); - RAFT_CUDA_TRY(cudaMemsetAsync( - averageDistanceBetweenSampleAndCluster.data(), 0, nRows * nLabels * sizeof(DataT), stream)); - - raft::linalg::matrixVectorOp>(averageDistanceBetweenSampleAndCluster.data(), - sampleToClusterSumOfDistances.data(), - binCountArray.data(), - binCountArray.data(), - nLabels, - nRows, - true, - true, - DivOp(), - stream); - - // calculating row-wise minimum - raft::linalg::reduce, MinOp>( - d_bArray.data(), - averageDistanceBetweenSampleAndCluster.data(), - nLabels, - nRows, - std::numeric_limits::max(), - true, - true, - stream, - false, - raft::Nop(), - MinOp()); - - // calculating the silhouette score per sample using the d_aArray and d_bArray - raft::linalg::binaryOp>( - perSampleSilScore, d_aArray.data(), d_bArray.data(), nRows, SilOp(), stream); - - // calculating the sum of all the silhouette score - rmm::device_scalar d_avgSilhouetteScore(stream); - RAFT_CUDA_TRY(cudaMemsetAsync(d_avgSilhouetteScore.data(), 0, sizeof(DataT), stream)); - - raft::linalg::mapThenSumReduce>(d_avgSilhouetteScore.data(), - nRows, - raft::Nop(), - stream, - perSampleSilScore, - perSampleSilScore); - - DataT avgSilhouetteScore = d_avgSilhouetteScore.value(stream); - - handle.sync_stream(stream); - - avgSilhouetteScore /= nRows; - - return avgSilhouetteScore; -} - -}; // namespace Metrics -}; // namespace MLCommon diff --git a/cpp/src_prims/metrics/trustworthiness_score.cuh b/cpp/src_prims/metrics/trustworthiness_score.cuh deleted file mode 100644 index c476cd209d..0000000000 --- a/cpp/src_prims/metrics/trustworthiness_score.cuh +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright (c) 2021-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include -#include -#include -#include -#include -#include -#include - -#define N_THREADS 512 - -namespace MLCommon { -namespace Score { - -/** - * @brief Build the lookup table - * @param[out] lookup_table: Lookup table giving nearest neighbor order - * of pairwise distance calculations given sample index - * @param[in] X_ind: Sorted indexes of pairwise distance calculations of X - * @param n: Number of samples - * @param work: Number of elements to consider - */ -__global__ void build_lookup_table(int* lookup_table, const int* X_ind, int n, int work) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= work) return; - - int sample_idx = i / n; - int nn_idx = i % n; - - int idx = X_ind[i]; - lookup_table[(sample_idx * n) + idx] = nn_idx; -} - -/** - * @brief Compute a the rank of trustworthiness score - * @param[out] rank: Resulting rank - * @param[out] lookup_table: Lookup table giving nearest neighbor order - * of pairwise distance calculations given sample index - * @param[in] emb_ind: Indexes of KNN on embeddings - * @param n: Number of samples - * @param n_neighbors: Number of neighbors considered by trustworthiness score - * @param work: Batch to consider (to do it at once use n * n_neighbors) - */ -template -__global__ void compute_rank(double* rank, - const int* lookup_table, - const knn_index_t* emb_ind, - int n, - int n_neighbors, - int work) -{ - int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i >= work) return; - - int sample_idx = i / n_neighbors; - - knn_index_t emb_nn_ind = emb_ind[i]; - - int r = lookup_table[(sample_idx * n) + emb_nn_ind]; - int tmp = r - n_neighbors + 1; - if (tmp > 0) raft::myAtomicAdd(rank, tmp); -} - -/** - * @brief Compute a kNN and returns the indices of the nearest neighbors - * @param h Raft handle - * @param[in] input Input matrix containing the dataset - * @param n Number of samples - * @param d Number of features - * @param n_neighbors number of neighbors - * @param[out] indices KNN indexes - * @param[out] distances KNN distances - */ -template -void run_knn(const raft::handle_t& h, - math_t* input, - int n, - int d, - int n_neighbors, - int64_t* indices, - math_t* distances) -{ - std::vector ptrs(1); - std::vector sizes(1); - ptrs[0] = input; - sizes[0] = n; - - raft::spatial::knn::brute_force_knn(h, - ptrs, - sizes, - d, - input, - n, - indices, - distances, - n_neighbors, - true, - true, - nullptr, - distance_type); -} - -/** - * @brief Compute the trustworthiness score - * @param h Raft handle - * @param X[in]: Data in original dimension - * @param X_embedded[in]: Data in target dimension (embedding) - * @param n: Number of samples - * @param m: Number of features in high/original dimension - * @param d: Number of features in low/embedded dimension - * @param n_neighbors Number of neighbors considered by trustworthiness score - * @param batchSize Batch size - * @return Trustworthiness score - */ -template -double trustworthiness_score(const raft::handle_t& h, - const math_t* X, - math_t* X_embedded, - int n, - int m, - int d, - int n_neighbors, - int batchSize = 512) -{ - cudaStream_t stream = h.get_stream(); - - const int KNN_ALLOC = n * (n_neighbors + 1); - rmm::device_uvector emb_ind(KNN_ALLOC, stream); - rmm::device_uvector emb_dist(KNN_ALLOC, stream); - - run_knn(h, X_embedded, n, d, n_neighbors + 1, emb_ind.data(), emb_dist.data()); - - const int PAIRWISE_ALLOC = batchSize * n; - rmm::device_uvector X_ind(PAIRWISE_ALLOC, stream); - rmm::device_uvector X_dist(PAIRWISE_ALLOC, stream); - rmm::device_uvector lookup_table(PAIRWISE_ALLOC, stream); - - double t = 0.0; - rmm::device_scalar t_dbuf(stream); - - int toDo = n; - while (toDo > 0) { - int curBatchSize = min(toDo, batchSize); - - // Takes at most batchSize vectors at a time - ML::Metrics::pairwise_distance( - h, &X[(n - toDo) * m], X, X_dist.data(), curBatchSize, n, m, distance_type); - - size_t colSortWorkspaceSize = 0; - bool bAllocWorkspace = false; - - MLCommon::Selection::sortColumnsPerRow(X_dist.data(), - X_ind.data(), - curBatchSize, - n, - bAllocWorkspace, - nullptr, - colSortWorkspaceSize, - stream); - - if (bAllocWorkspace) { - rmm::device_uvector sortColsWorkspace(colSortWorkspaceSize, stream); - - MLCommon::Selection::sortColumnsPerRow(X_dist.data(), - X_ind.data(), - curBatchSize, - n, - bAllocWorkspace, - sortColsWorkspace.data(), - colSortWorkspaceSize, - stream); - } - - int work = curBatchSize * n; - int n_blocks = raft::ceildiv(work, N_THREADS); - build_lookup_table<<>>( - lookup_table.data(), X_ind.data(), n, work); - - RAFT_CUDA_TRY(cudaMemsetAsync(t_dbuf.data(), 0, sizeof(double), stream)); - - work = curBatchSize * (n_neighbors + 1); - n_blocks = raft::ceildiv(work, N_THREADS); - compute_rank<<>>( - t_dbuf.data(), - lookup_table.data(), - &emb_ind.data()[(n - toDo) * (n_neighbors + 1)], - n, - n_neighbors + 1, - work); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - - t += t_dbuf.value(stream); - - toDo -= curBatchSize; - } - - t = 1.0 - ((2.0 / ((n * n_neighbors) * ((2.0 * n) - (3.0 * n_neighbors) - 1.0))) * t); - - return t; -} -} // namespace Score -} // namespace MLCommon diff --git a/cpp/src_prims/metrics/v_measure.cuh b/cpp/src_prims/metrics/v_measure.cuh deleted file mode 100644 index e0396c5702..0000000000 --- a/cpp/src_prims/metrics/v_measure.cuh +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * @file v_measure.cuh - */ - -#include "homogeneity_score.cuh" - -namespace MLCommon { - -namespace Metrics { - -/** - * @brief Function to calculate the v-measure between two clusters - * - * @param truthClusterArray: the array of truth classes of type T - * @param predClusterArray: the array of predicted classes of type T - * @param size: the size of the data points of type int - * @param lowerLabelRange: the lower bound of the range of labels - * @param upperLabelRange: the upper bound of the range of labels - * @param stream: the cudaStream object - * @param beta: v_measure parameter - */ -template -double v_measure(const T* truthClusterArray, - const T* predClusterArray, - int size, - T lowerLabelRange, - T upperLabelRange, - cudaStream_t stream, - double beta = 1.0) -{ - double computedHomogeity, computedCompleteness, computedVMeasure; - - computedHomogeity = MLCommon::Metrics::homogeneity_score( - truthClusterArray, predClusterArray, size, lowerLabelRange, upperLabelRange, stream); - computedCompleteness = MLCommon::Metrics::homogeneity_score( - predClusterArray, truthClusterArray, size, lowerLabelRange, upperLabelRange, stream); - - if (computedCompleteness + computedHomogeity == 0.0) - computedVMeasure = 0.0; - else - computedVMeasure = ((1 + beta) * computedHomogeity * computedCompleteness / - (beta * computedHomogeity + computedCompleteness)); - - return computedVMeasure; -} - -}; // end namespace Metrics -}; // end namespace MLCommon diff --git a/cpp/src_prims/selection/columnWiseSort.cuh b/cpp/src_prims/selection/columnWiseSort.cuh deleted file mode 100644 index 6db4f3bf7f..0000000000 --- a/cpp/src_prims/selection/columnWiseSort.cuh +++ /dev/null @@ -1,346 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include -#include - -#include -#include -#include - -#define INST_BLOCK_SORT(keyIn, keyOut, valueInOut, rows, columns, blockSize, elemPT, stream) \ - devKeyValSortColumnPerRow<<>>( \ - keyIn, keyOut, valueInOut, rows, columns, std::numeric_limits::max()) - -namespace MLCommon { -namespace Selection { - -template -struct TemplateChecker { - enum { - IsValid = (std::is_same::value && BLOCK_SIZE <= 1024) || - (std::is_same::value && BLOCK_SIZE <= 1024) || - (std::is_same::value && BLOCK_SIZE <= 1024) || - (std::is_same::value && BLOCK_SIZE <= 512) - }; -}; - -template -struct SmemPerBlock { - typedef cub::BlockLoad - BlockLoadTypeKey; - - typedef cub::BlockRadixSort BlockRadixSortType; - - union TempStorage { - typename BlockLoadTypeKey::TempStorage keyLoad; - typename BlockRadixSortType::TempStorage sort; - } tempStorage; -}; - -template -__global__ void devLayoutIdx(InType* in, int n_cols, int totalElements) -{ - int idx = threadIdx.x + blockDim.x * blockIdx.x; - int n = n_cols; - - if (idx < totalElements) { in[idx] = idx % n; } -} - -template -__global__ void devOffsetKernel(T* in, T value, int n_times) -{ - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx < n_times) in[idx] = idx * value; -} - -// block level radix sort - can only sort as much data we can fit within shared memory -template < - typename InType, - typename OutType, - int BLOCK_SIZE, - int ITEMS_PER_THREAD, - typename std::enable_if::IsValid, InType>::type* = nullptr> -__global__ void __launch_bounds__(1024, 1) devKeyValSortColumnPerRow(const InType* inputKeys, - InType* outputKeys, - OutType* inputVals, - int n_rows, - int n_cols, - InType MAX_VALUE) -{ - typedef cub::BlockLoad - BlockLoadTypeKey; - - typedef cub::BlockRadixSort BlockRadixSortType; - - __shared__ SmemPerBlock tmpSmem; - - InType threadKeys[ITEMS_PER_THREAD]; - OutType threadValues[ITEMS_PER_THREAD]; - - int blockOffset = blockIdx.x * n_cols; - BlockLoadTypeKey(tmpSmem.tempStorage.keyLoad) - .Load(inputKeys + blockOffset, threadKeys, n_cols, MAX_VALUE); - - OutType idxBase = threadIdx.x * ITEMS_PER_THREAD; - for (int i = 0; i < ITEMS_PER_THREAD; i++) { - OutType eId = idxBase + (OutType)i; - if (eId < n_cols) - threadValues[i] = eId; - else - threadValues[i] = MAX_VALUE; - } - - __syncthreads(); - - BlockRadixSortType(tmpSmem.tempStorage.sort).SortBlockedToStriped(threadKeys, threadValues); - - // storing index values back (not keys) - cub::StoreDirectStriped(threadIdx.x, inputVals + blockOffset, threadValues, n_cols); - - if (outputKeys) { - cub::StoreDirectStriped(threadIdx.x, outputKeys + blockOffset, threadKeys, n_cols); - } -} - -template < - typename InType, - typename OutType, - int BLOCK_SIZE, - int ITEMS_PER_THREAD, - typename std::enable_if::IsValid), InType>::type* = nullptr> -__global__ void devKeyValSortColumnPerRow(const InType* inputKeys, - InType* outputKeys, - OutType* inputVals, - int n_rows, - int n_cols, - InType MAX_VALUE) -{ - // place holder function - // so that compiler unrolls for all template types successfully -} - -// helper function to layout values (index's) for key-value sort -template -cudaError_t layoutIdx(OutType* in, int n_rows, int n_columns, cudaStream_t stream) -{ - int totalElements = n_rows * n_columns; - dim3 block(256); - dim3 grid((totalElements + block.x - 1) / block.x); - devLayoutIdx<<>>(in, n_columns, totalElements); - return cudaGetLastError(); -} - -// helper function to layout offsets for rows for DeviceSegmentedRadixSort -template -cudaError_t layoutSortOffset(T* in, T value, int n_times, cudaStream_t stream) -{ - dim3 block(128); - dim3 grid((n_times + block.x - 1) / block.x); - devOffsetKernel<<>>(in, value, n_times); - return cudaGetLastError(); -} - -/** - * @brief sort columns within each row of row-major input matrix and return sorted indexes - * modelled as key-value sort with key being input matrix and value being index of values - * @param in: input matrix - * @param out: output value(index) matrix - * @param n_rows: number rows of input matrix - * @param n_cols: number columns of input matrix - * @param bAllocWorkspace: check returned value, if true allocate workspace passed in workspaceSize - * @param workspacePtr: pointer to workspace memory - * @param workspaceSize: Size of workspace to be allocated - * @param stream: cuda stream to execute prim on - * @param sortedKeys: Optional, output matrix for sorted keys (input) - */ -template -void sortColumnsPerRow(const InType* in, - OutType* out, - int n_rows, - int n_columns, - bool& bAllocWorkspace, - void* workspacePtr, - size_t& workspaceSize, - cudaStream_t stream, - InType* sortedKeys = nullptr) -{ - // assume non-square row-major matrices - // current use-case: KNN, trustworthiness scores - // output : either sorted indices or sorted indices and input values - // future : this prim can be modified to be more generic and serve as a way to sort column entries - // per row - // i.e. another output format: sorted values only - - int totalElements = n_rows * n_columns; - size_t perElementSmemUsage = sizeof(InType) + sizeof(OutType); - size_t memAlignWidth = 256; - - // @ToDo: Figure out dynamic shared memory for block sort kernel - better for volta and beyond - // int currDevice = 0, smemLimit = 0; - // RAFT_CUDA_TRY(cudaGetDevice(&currDevice)); - // RAFT_CUDA_TRY(cudaDeviceGetAttribute(&smemLimit, cudaDevAttrMaxSharedMemoryPerBlock, - // currDevice)); size_t maxElementsForBlockSort = smemLimit / perElementSmemUsage; - - // for 48KB smem/block, can fit in 6144 4byte key-value pair - // assuming key-value sort for now - smem computation will change for value only sort - // dtype being size of key-value pair - std::map dtypeToColumnMap = {{4, 12288}, // short + short - {8, 12288}, // float/int + int/float - {12, 6144}, // double + int/float - {16, 6144}}; // double + double - - if (dtypeToColumnMap.count(perElementSmemUsage) != 0 && - n_columns <= dtypeToColumnMap[perElementSmemUsage]) { - // more elements per thread --> more register pressure - // 512(blockSize) * 8 elements per thread = 71 register / thread - - // instantiate some kernel combinations - if (n_columns <= 512) - INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 128, 4, stream); - else if (n_columns > 512 && n_columns <= 1024) - INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 128, 8, stream); - else if (n_columns > 1024 && n_columns <= 3072) - INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 512, 6, stream); - else if (n_columns > 3072 && n_columns <= 4096) - INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 512, 8, stream); - else if (n_columns > 4096 && n_columns <= 6144) - INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 512, 12, stream); - else - INST_BLOCK_SORT(in, sortedKeys, out, n_rows, n_columns, 1024, 12, stream); - } else if (n_columns <= (1 << 18) && n_rows > 1) { - // device Segmented radix sort - // 2^18 column cap to restrict size of workspace ~512 MB - // will give better perf than below deviceWide Sort for even larger dims - int numSegments = n_rows + 1; - - // need auxillary storage: cub sorting + keys (if user not passing) + - // staging for values out + segment partition - if (workspaceSize == 0 || !workspacePtr) { - OutType* tmpValIn = nullptr; - int* tmpOffsetBuffer = nullptr; - - // first call is to get size of workspace - RAFT_CUDA_TRY(cub::DeviceSegmentedRadixSort::SortPairs(workspacePtr, - workspaceSize, - in, - sortedKeys, - tmpValIn, - out, - totalElements, - numSegments, - tmpOffsetBuffer, - tmpOffsetBuffer + 1)); - bAllocWorkspace = true; - // more staging space for temp output of keys - if (!sortedKeys) - workspaceSize += raft::alignTo(sizeof(InType) * (size_t)totalElements, memAlignWidth); - - // value in KV pair need to be passed in, out buffer is separate - workspaceSize += raft::alignTo(sizeof(OutType) * (size_t)totalElements, memAlignWidth); - - // for segment offsets - workspaceSize += raft::alignTo(sizeof(int) * (size_t)numSegments, memAlignWidth); - } else { - size_t workspaceOffset = 0; - - if (!sortedKeys) { - sortedKeys = reinterpret_cast(workspacePtr); - workspaceOffset = raft::alignTo(sizeof(InType) * (size_t)totalElements, memAlignWidth); - workspacePtr = (void*)((size_t)workspacePtr + workspaceOffset); - } - - OutType* dValuesIn = reinterpret_cast(workspacePtr); - workspaceOffset = raft::alignTo(sizeof(OutType) * (size_t)totalElements, memAlignWidth); - workspacePtr = (void*)((size_t)workspacePtr + workspaceOffset); - - int* dSegmentOffsets = reinterpret_cast(workspacePtr); - workspaceOffset = raft::alignTo(sizeof(int) * (size_t)numSegments, memAlignWidth); - workspacePtr = (void*)((size_t)workspacePtr + workspaceOffset); - - // layout idx - RAFT_CUDA_TRY(layoutIdx(dValuesIn, n_rows, n_columns, stream)); - - // layout segment lengths - spread out column length - RAFT_CUDA_TRY(layoutSortOffset(dSegmentOffsets, n_columns, numSegments, stream)); - - RAFT_CUDA_TRY(cub::DeviceSegmentedRadixSort::SortPairs(workspacePtr, - workspaceSize, - in, - sortedKeys, - dValuesIn, - out, - totalElements, - numSegments, - dSegmentOffsets, - dSegmentOffsets + 1, - 0, - sizeof(InType) * 8, - stream)); - } - } else { - // batched per row device wide sort - if (workspaceSize == 0 || !workspacePtr) { - OutType* tmpValIn = nullptr; - - // first call is to get size of workspace - RAFT_CUDA_TRY(cub::DeviceRadixSort::SortPairs( - workspacePtr, workspaceSize, in, sortedKeys, tmpValIn, out, n_columns)); - bAllocWorkspace = true; - - if (!sortedKeys) - workspaceSize += raft::alignTo(sizeof(InType) * (size_t)n_columns, memAlignWidth); - - workspaceSize += raft::alignTo(sizeof(OutType) * (size_t)n_columns, memAlignWidth); - } else { - size_t workspaceOffset = 0; - bool userKeyOutputBuffer = true; - - if (!sortedKeys) { - userKeyOutputBuffer = false; - sortedKeys = reinterpret_cast(workspacePtr); - workspaceOffset = raft::alignTo(sizeof(InType) * (size_t)n_columns, memAlignWidth); - workspacePtr = (void*)((size_t)workspacePtr + workspaceOffset); - } - - OutType* dValuesIn = reinterpret_cast(workspacePtr); - workspaceOffset = raft::alignTo(sizeof(OutType) * (size_t)n_columns, memAlignWidth); - workspacePtr = (void*)((size_t)workspacePtr + workspaceOffset); - - // layout idx - RAFT_CUDA_TRY(layoutIdx(dValuesIn, 1, n_columns, stream)); - - for (int i = 0; i < n_rows; i++) { - InType* rowIn = - reinterpret_cast((size_t)in + (i * sizeof(InType) * (size_t)n_columns)); - OutType* rowOut = - reinterpret_cast((size_t)out + (i * sizeof(OutType) * (size_t)n_columns)); - - RAFT_CUDA_TRY(cub::DeviceRadixSort::SortPairs( - workspacePtr, workspaceSize, rowIn, sortedKeys, dValuesIn, rowOut, n_columns)); - - if (userKeyOutputBuffer) - sortedKeys = - reinterpret_cast((size_t)sortedKeys + sizeof(InType) * (size_t)n_columns); - } - } - } -} -}; // end namespace Selection -}; // end namespace MLCommon diff --git a/cpp/src_prims/selection/haversine_knn.cuh b/cpp/src_prims/selection/haversine_knn.cuh deleted file mode 100644 index c531c3577b..0000000000 --- a/cpp/src_prims/selection/haversine_knn.cuh +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include -#include - -namespace raft { -namespace selection { - -template -DI value_t compute_haversine(value_t x1, value_t y1, value_t x2, value_t y2) -{ - value_t sin_0 = sin(0.5 * (x1 - y1)); - value_t sin_1 = sin(0.5 * (x2 - y2)); - value_t rdist = sin_0 * sin_0 + cos(x1) * cos(y1) * sin_1 * sin_1; - - return 2 * asin(sqrt(rdist)); -} - -/** - * @tparam value_idx data type of indices - * @tparam value_t data type of values and distances - * @tparam warp_q - * @tparam thread_q - * @tparam tpb - * @param[out] out_inds output indices - * @param[out] out_dists output distances - * @param[in] index index array - * @param[in] query query array - * @param[in] n_index_rows number of rows in index array - * @param[in] k number of closest neighbors to return - */ -template -__global__ void haversine_knn_kernel(value_idx* out_inds, - value_t* out_dists, - const value_t* index, - const value_t* query, - size_t n_index_rows, - int k) -{ - constexpr int kNumWarps = tpb / faiss::gpu::kWarpSize; - - __shared__ value_t smemK[kNumWarps * warp_q]; - __shared__ value_idx smemV[kNumWarps * warp_q]; - - faiss::gpu:: - BlockSelect, warp_q, thread_q, tpb> - heap(faiss::gpu::Limits::getMax(), -1, smemK, smemV, k); - - // Grid is exactly sized to rows available - int limit = faiss::gpu::utils::roundDown(n_index_rows, faiss::gpu::kWarpSize); - - const value_t* query_ptr = query + (blockIdx.x * 2); - value_t x1 = query_ptr[0]; - value_t x2 = query_ptr[1]; - - int i = threadIdx.x; - - for (; i < limit; i += tpb) { - const value_t* idx_ptr = index + (i * 2); - value_t y1 = idx_ptr[0]; - value_t y2 = idx_ptr[1]; - - value_t dist = compute_haversine(x1, y1, x2, y2); - - heap.add(dist, i); - } - - // Handle last remainder fraction of a warp of elements - if (i < n_index_rows) { - const value_t* idx_ptr = index + (i * 2); - value_t y1 = idx_ptr[0]; - value_t y2 = idx_ptr[1]; - - value_t dist = compute_haversine(x1, y1, x2, y2); - - heap.addThreadQ(dist, i); - } - - heap.reduce(); - - for (int i = threadIdx.x; i < k; i += tpb) { - out_dists[blockIdx.x * k + i] = smemK[i]; - out_inds[blockIdx.x * k + i] = smemV[i]; - } -} - -/** - * Conmpute the k-nearest neighbors using the Haversine - * (great circle arc) distance. Input is assumed to have - * 2 dimensions (latitude, longitude) in radians. - - * @tparam value_idx - * @tparam value_t - * @param[out] out_inds output indices array on device (size n_query_rows * k) - * @param[out] out_dists output dists array on device (size n_query_rows * k) - * @param[in] index input index array on device (size n_index_rows * 2) - * @param[in] query input query array on device (size n_query_rows * 2) - * @param[in] n_index_rows number of rows in index array - * @param[in] n_query_rows number of rows in query array - * @param[in] k number of closest neighbors to return - * @param[in] stream stream to order kernel launch - */ -template -void haversine_knn(value_idx* out_inds, - value_t* out_dists, - const value_t* index, - const value_t* query, - size_t n_index_rows, - size_t n_query_rows, - int k, - cudaStream_t stream) -{ - haversine_knn_kernel<<>>( - out_inds, out_dists, index, query, n_index_rows, k); -} - -}; // namespace selection -}; // namespace raft diff --git a/cpp/src_prims/selection/knn.cuh b/cpp/src_prims/selection/knn.cuh deleted file mode 100644 index bdba95d082..0000000000 --- a/cpp/src_prims/selection/knn.cuh +++ /dev/null @@ -1,348 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "haversine_knn.cuh" -#include "processing.cuh" - -#include