diff --git a/cpp/include/cuml/cluster/hdbscan.hpp b/cpp/include/cuml/cluster/hdbscan.hpp index 9fe7d3659a..9dd934fd2c 100644 --- a/cpp/include/cuml/cluster/hdbscan.hpp +++ b/cpp/include/cuml/cluster/hdbscan.hpp @@ -459,7 +459,7 @@ void compute_all_points_membership_vectors( const float* X, raft::distance::DistanceType metric, float* membership_vec, - int batch_size); + size_t batch_size = 4096); void compute_membership_vector(const raft::handle_t& handle, HDBSCAN::Common::CondensedHierarchy& condensed_tree, @@ -470,7 +470,7 @@ void compute_membership_vector(const raft::handle_t& handle, int min_samples, raft::distance::DistanceType metric, float* membership_vec, - int batch_size); + size_t batch_size = 4096); void out_of_sample_predict(const raft::handle_t& handle, HDBSCAN::Common::CondensedHierarchy& condensed_tree, diff --git a/cpp/src/hdbscan/detail/soft_clustering.cuh b/cpp/src/hdbscan/detail/soft_clustering.cuh index 4f13de35b8..10085d0e49 100644 --- a/cpp/src/hdbscan/detail/soft_clustering.cuh +++ b/cpp/src/hdbscan/detail/soft_clustering.cuh @@ -67,7 +67,7 @@ void dist_membership_vector(const raft::handle_t& handle, value_idx* exemplar_label_offsets, value_t* dist_membership_vec, raft::distance::DistanceType metric, - int batch_size, + size_t batch_size, bool softmax = false) { auto stream = handle.get_stream(); @@ -82,16 +82,11 @@ void dist_membership_vector(const raft::handle_t& handle, // compute the number of batches based on the batch size value_idx n_batches; - if (batch_size == 0) { - n_batches = 1; - batch_size = n_queries; - } - else { - n_batches = raft::ceildiv((int)n_queries, (int)batch_size); - } + n_batches = raft::ceildiv((int)n_queries, (int)batch_size); + for(value_idx bid = 0; bid < n_batches; bid++) { value_idx batch_offset = bid * batch_size; - value_idx samples_per_batch = min(batch_size, (int)n_queries - batch_offset); + value_idx samples_per_batch = min((value_idx)batch_size, (value_idx)n_queries - batch_offset); rmm::device_uvector 
dist(samples_per_batch * n_exemplars, stream); // compute the distances using raft API @@ -392,14 +387,16 @@ void all_points_membership_vectors(const raft::handle_t& handle, const value_t* X, raft::distance::DistanceType metric, value_t* membership_vec, - value_idx batch_size) + size_t batch_size) { auto stream = handle.get_stream(); auto exec_policy = handle.get_thrust_policy(); size_t m = prediction_data.n_rows; size_t n = prediction_data.n_cols; - RAFT_EXPECTS(0 <= batch_size && batch_size <= m, "Invalid batch_size. batch_size should be >= 0 and <= the number of samples in the training data"); + + if (batch_size > m) batch_size = m; + RAFT_EXPECTS(0 < batch_size && batch_size <= m, "Invalid batch_size. batch_size should be > 0 and <= the number of samples in the training data"); auto parents = condensed_tree.get_parents(); auto children = condensed_tree.get_children(); @@ -507,11 +504,10 @@ void membership_vector(const raft::handle_t& handle, raft::distance::DistanceType metric, int min_samples, value_t* membership_vec, - value_idx batch_size) + size_t batch_size) { RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded, "Currently only L2 expanded distance is supported"); - RAFT_EXPECTS(0 <= batch_size && batch_size <= n_prediction_points, "Invalid batch_size. batch_size should be >= 0 and <= the number of points to predict"); auto stream = handle.get_stream(); auto exec_policy = handle.get_thrust_policy(); @@ -525,6 +521,9 @@ void membership_vector(const raft::handle_t& handle, value_idx n_exemplars = prediction_data.get_n_exemplars(); value_t* lambdas = condensed_tree.get_lambdas(); + if (batch_size > n_prediction_points) batch_size = n_prediction_points; + RAFT_EXPECTS(0 < batch_size && batch_size <= n_prediction_points, "Invalid batch_size. 
batch_size should be > 0 and <= the number of points to predict"); + rmm::device_uvector dist_membership_vec(n_prediction_points * n_selected_clusters, stream); diff --git a/cpp/src/hdbscan/hdbscan.cu b/cpp/src/hdbscan/hdbscan.cu index ed50c3aa80..66476457bb 100644 --- a/cpp/src/hdbscan/hdbscan.cu +++ b/cpp/src/hdbscan/hdbscan.cu @@ -93,7 +93,7 @@ void compute_all_points_membership_vectors( const float* X, raft::distance::DistanceType metric, float* membership_vec, - int batch_size) + size_t batch_size) { HDBSCAN::detail::Predict::all_points_membership_vectors( handle, condensed_tree, prediction_data, X, metric, membership_vec, batch_size); @@ -108,7 +108,7 @@ void compute_membership_vector(const raft::handle_t& handle, int min_samples, raft::distance::DistanceType metric, float* membership_vec, - int batch_size) + size_t batch_size) { // Note that (min_samples+1) is parsed to the approximate_predict function. This was done for the // core distance computation to consistent with Scikit learn Contrib. 
diff --git a/cpp/test/sg/hdbscan_test.cu b/cpp/test/sg/hdbscan_test.cu index 115090af4b..97b51bc181 100644 --- a/cpp/test/sg/hdbscan_test.cu +++ b/cpp/test/sg/hdbscan_test.cu @@ -25,10 +25,7 @@ #include #include #include -#include #include -#include -#include #include #include @@ -460,8 +457,7 @@ class AllPointsMembershipVectorsTest prediction_data_, data.data(), raft::distance::DistanceType::L2SqrtExpanded, - membership_vec.data(), - 0); + membership_vec.data()); ASSERT_TRUE(MLCommon::devArrMatch(membership_vec.data(), params.expected_probabilities.data(), @@ -755,8 +751,7 @@ class MembershipVectorTest : public ::testing::TestWithParam clusterer.condensed_tree_ptr cdef handle_t* handle_ = clusterer.handle.getHandle() + compute_all_points_membership_vectors(handle_[0], deref(condensed_tree), deref(prediction_data_), @@ -229,7 +232,7 @@ def all_points_membership_vectors(clusterer, batch_size=0): clusterer.n_clusters_)) -def membership_vector(clusterer, points_to_predict, batch_size=0, convert_dtype=True): +def membership_vector(clusterer, points_to_predict, batch_size=4096, convert_dtype=True): """Predict soft cluster membership. The result produces a vector for each point in ``points_to_predict`` that gives a probability that the given point is a member of a cluster for each of the selected clusters @@ -247,11 +250,13 @@ def membership_vector(clusterer, points_to_predict, batch_size=0, convert_dtype= have the same dimensionality as the original dataset over which clusterer was fit. - batch_size : int, optional, default=0 + batch_size : int, optional, default=min(4096, n_points_to_predict) Lowers memory requirement by computing distance-based membership in smaller batches of points in the training data. Batch size of 0 uses all of the training points, batch size of 1000 computes distances for - 1000 points at a time. + 1000 points at a time. The default batch_size is 4096. 
If the number + of points to predict is less than 4096, this defaults to + the number of points to predict. Returns ------- diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py index 16bf6c0c40..780404bf42 100644 --- a/python/cuml/tests/test_hdbscan.py +++ b/python/cuml/tests/test_hdbscan.py @@ -531,7 +531,7 @@ def test_hdbscan_plots(): @pytest.mark.parametrize("cluster_selection_epsilon", [0.0, 0.5]) @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) -@pytest.mark.parametrize("batch_size", [0, 128]) +@pytest.mark.parametrize("batch_size", [128, 1000]) def test_all_points_membership_vectors_blobs( nrows, ncols, @@ -593,7 +593,7 @@ def test_all_points_membership_vectors_blobs( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("batch_size", [0, 128]) +@pytest.mark.parametrize("batch_size", [128, 1000]) def test_all_points_membership_vectors_moons( nrows, min_samples, @@ -650,7 +650,7 @@ def test_all_points_membership_vectors_moons( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("batch_size", [0, 128]) +@pytest.mark.parametrize("batch_size", [128, 1000]) def test_all_points_membership_vectors_circles( nrows, min_samples, @@ -981,7 +981,7 @@ def test_approximate_predict_digits( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("allow_single_cluster", [True, False]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) -@pytest.mark.parametrize("batch_size", [0, 128]) +@pytest.mark.parametrize("batch_size", [128]) def test_membership_vector_blobs( nrows, n_points_to_predict, @@ -1057,7 +1057,7 @@ def test_membership_vector_blobs( 
@pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("batch_size", [0, 16]) +@pytest.mark.parametrize("batch_size", [16]) def test_membership_vector_moons( nrows, n_points_to_predict, @@ -1121,7 +1121,7 @@ def test_membership_vector_moons( @pytest.mark.parametrize("max_cluster_size", [0]) @pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"]) @pytest.mark.parametrize("connectivity", ["knn"]) -@pytest.mark.parametrize("batch_size", [0, 16]) +@pytest.mark.parametrize("batch_size", [16]) def test_membership_vector_circles( nrows, n_points_to_predict,