Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEA] Modify default batch size in HDBSCAN soft clustering #5335

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
e49d06a
membership_vector initial commit
tarang-jain Feb 18, 2023
436b180
Further updates to membership_vector
tarang-jain Feb 22, 2023
48030b8
Merge branch 'branch-23.04' into fea-membership-vector
tarang-jain Feb 22, 2023
7912dba
Initial testing membership_vector
tarang-jain Feb 23, 2023
4b41edb
Debug statements
tarang-jain Feb 23, 2023
fe0fd34
Merge branch 'fea-membership-vector' of https://github.com/tarang-jai…
tarang-jain Feb 23, 2023
9d5badc
debugging membership_vector
tarang-jain Feb 24, 2023
19f9dd8
membership_vector first working impl
tarang-jain Feb 28, 2023
a4b565c
GoogleTest intermediate commit
tarang-jain Feb 28, 2023
1f4bf78
GTest working
tarang-jain Feb 28, 2023
fdf100b
working tests and styling changes
tarang-jain Feb 28, 2023
e18096a
replace with raft mdspan primitives and add FastIntDiv
tarang-jain Mar 1, 2023
c2aa77e
Merge branch 'branch-23.04' into fea-membership-vector
tarang-jain Mar 1, 2023
182ba31
cpu support
tarang-jain Mar 1, 2023
366ef26
Fix failing pytest
tarang-jain Mar 7, 2023
b60d869
Merge branch 'branch-23.04' into fea-membership-vector
tarang-jain Mar 7, 2023
6bfaae2
modification after merge
tarang-jain Mar 7, 2023
c4e0bf1
Update softmax with raft::linalg reduction
tarang-jain Mar 8, 2023
fb634e4
Remove sync stream
tarang-jain Mar 9, 2023
a49ba87
memory study commit (to be reversed)
tarang-jain Mar 11, 2023
4ed9fd7
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-memb…
tarang-jain Mar 11, 2023
d1712c0
first commit (working)
tarang-jain Mar 13, 2023
f41416a
set batch_size as an arg
tarang-jain Mar 14, 2023
333077a
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-new-…
tarang-jain Mar 14, 2023
71217e2
working build, styling changes
tarang-jain Mar 14, 2023
bdaefa5
batch_size added to membership_vector
tarang-jain Mar 17, 2023
04e76cb
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-new-…
tarang-jain Mar 17, 2023
fa7b44e
Style fix
tarang-jain Mar 17, 2023
45f8ca4
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-memb…
tarang-jain Mar 17, 2023
367de04
Remove print debug statements
tarang-jain Mar 17, 2023
eeb52c2
Resolved failing pytest
tarang-jain Mar 20, 2023
612afb1
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-new-…
tarang-jain Mar 20, 2023
980b1f7
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-memb…
tarang-jain Mar 20, 2023
0bf779b
copyright changes
tarang-jain Mar 20, 2023
98aa237
Merge branch 'branch-23.04' into fea-membership-vector
tarang-jain Mar 20, 2023
3a38769
Merge branch 'branch-23.04' into fea-new-reduce-memory-pressure-apmv
tarang-jain Mar 20, 2023
d387026
Merge branch 'branch-23.04' into fea-membership-vector
tarang-jain Mar 27, 2023
ed40e22
Updates after PR reviews
tarang-jain Mar 28, 2023
387cde8
Merge branch 'fea-membership-vector' of https://github.com/tarang-jai…
tarang-jain Mar 28, 2023
092b3f8
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-memb…
tarang-jain Mar 28, 2023
ef85fd3
Update height_argmax
tarang-jain Mar 28, 2023
52eda5c
Intermediate merge commit
tarang-jain Mar 29, 2023
d8da560
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-new-…
tarang-jain Mar 29, 2023
7a95bfe
Update after merge membership_vector
tarang-jain Mar 29, 2023
dc92f90
Updates after PR Reviews
tarang-jain Mar 30, 2023
615ad10
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-new-…
tarang-jain Mar 30, 2023
7b89484
Merge branch 'branch-23.04' into fea-new-reduce-memory-pressure-apmv
tarang-jain Mar 30, 2023
7f7f0a4
Resolve merge conflicts
tarang-jain Mar 31, 2023
f37b05d
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-new-…
tarang-jain Apr 5, 2023
e5ada81
Update default batch_size
tarang-jain Apr 5, 2023
7335b73
Update docs
tarang-jain Apr 5, 2023
6318ec8
Rerun tests
tarang-jain Apr 5, 2023
35f7dfd
Merge branch 'branch-23.04' into fea-new-reduce-memory-pressure-apmv
dantegd Apr 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/include/cuml/cluster/hdbscan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ void compute_all_points_membership_vectors(
const float* X,
raft::distance::DistanceType metric,
float* membership_vec,
int batch_size);
size_t batch_size = 4096);

void compute_membership_vector(const raft::handle_t& handle,
HDBSCAN::Common::CondensedHierarchy<int, float>& condensed_tree,
Expand All @@ -470,7 +470,7 @@ void compute_membership_vector(const raft::handle_t& handle,
int min_samples,
raft::distance::DistanceType metric,
float* membership_vec,
int batch_size);
size_t batch_size = 4096);

void out_of_sample_predict(const raft::handle_t& handle,
HDBSCAN::Common::CondensedHierarchy<int, float>& condensed_tree,
Expand Down
25 changes: 12 additions & 13 deletions cpp/src/hdbscan/detail/soft_clustering.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ void dist_membership_vector(const raft::handle_t& handle,
value_idx* exemplar_label_offsets,
value_t* dist_membership_vec,
raft::distance::DistanceType metric,
int batch_size,
size_t batch_size,
bool softmax = false)
{
auto stream = handle.get_stream();
Expand All @@ -82,16 +82,11 @@ void dist_membership_vector(const raft::handle_t& handle,
// compute the number of batches based on the batch size
value_idx n_batches;

if (batch_size == 0) {
n_batches = 1;
batch_size = n_queries;
}
else {
n_batches = raft::ceildiv((int)n_queries, (int)batch_size);
}
n_batches = raft::ceildiv((int)n_queries, (int)batch_size);

for(value_idx bid = 0; bid < n_batches; bid++) {
value_idx batch_offset = bid * batch_size;
value_idx samples_per_batch = min(batch_size, (int)n_queries - batch_offset);
value_idx samples_per_batch = min((value_idx)batch_size, (value_idx)n_queries - batch_offset);
rmm::device_uvector<value_t> dist(samples_per_batch * n_exemplars, stream);

// compute the distances using raft API
Expand Down Expand Up @@ -392,14 +387,16 @@ void all_points_membership_vectors(const raft::handle_t& handle,
const value_t* X,
raft::distance::DistanceType metric,
value_t* membership_vec,
value_idx batch_size)
size_t batch_size)
{
auto stream = handle.get_stream();
auto exec_policy = handle.get_thrust_policy();

size_t m = prediction_data.n_rows;
size_t n = prediction_data.n_cols;
RAFT_EXPECTS(0 <= batch_size && batch_size <= m, "Invalid batch_size. batch_size should be >= 0 and <= the number of samples in the training data");

if (batch_size > m) batch_size = m;
RAFT_EXPECTS(0 < batch_size && batch_size <= m, "Invalid batch_size. batch_size should be > 0 and <= the number of samples in the training data");

auto parents = condensed_tree.get_parents();
auto children = condensed_tree.get_children();
Expand Down Expand Up @@ -507,11 +504,10 @@ void membership_vector(const raft::handle_t& handle,
raft::distance::DistanceType metric,
int min_samples,
value_t* membership_vec,
value_idx batch_size)
size_t batch_size)
{
RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
"Currently only L2 expanded distance is supported");
RAFT_EXPECTS(0 <= batch_size && batch_size <= n_prediction_points, "Invalid batch_size. batch_size should be >= 0 and <= the number of points to predict");

auto stream = handle.get_stream();
auto exec_policy = handle.get_thrust_policy();
Expand All @@ -525,6 +521,9 @@ void membership_vector(const raft::handle_t& handle,
value_idx n_exemplars = prediction_data.get_n_exemplars();
value_t* lambdas = condensed_tree.get_lambdas();

if (batch_size > n_prediction_points) batch_size = n_prediction_points;
RAFT_EXPECTS(0 < batch_size && batch_size <= n_prediction_points, "Invalid batch_size. batch_size should be > 0 and <= the number of samples in the training data");

rmm::device_uvector<value_t> dist_membership_vec(n_prediction_points * n_selected_clusters,
stream);

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/hdbscan/hdbscan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ void compute_all_points_membership_vectors(
const float* X,
raft::distance::DistanceType metric,
float* membership_vec,
int batch_size)
size_t batch_size)
{
HDBSCAN::detail::Predict::all_points_membership_vectors(
handle, condensed_tree, prediction_data, X, metric, membership_vec, batch_size);
Expand All @@ -108,7 +108,7 @@ void compute_membership_vector(const raft::handle_t& handle,
int min_samples,
raft::distance::DistanceType metric,
float* membership_vec,
int batch_size)
size_t batch_size)
{
// Note that (min_samples+1) is passed to the approximate_predict function. This was done for the
// core distance computation to be consistent with scikit-learn-contrib.
Expand Down
9 changes: 2 additions & 7 deletions cpp/test/sg/hdbscan_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,7 @@
#include <cuml/cluster/hdbscan.hpp>
#include <hdbscan/detail/condense.cuh>
#include <hdbscan/detail/extract.cuh>
#include <hdbscan/detail/predict.cuh>
#include <hdbscan/detail/reachability.cuh>
#include <hdbscan/detail/soft_clustering.cuh>
#include <hdbscan/detail/utils.h>

#include <raft/spatial/knn/specializations.cuh>
#include <raft/stats/adjusted_rand_index.cuh>
Expand Down Expand Up @@ -460,8 +457,7 @@ class AllPointsMembershipVectorsTest
prediction_data_,
data.data(),
raft::distance::DistanceType::L2SqrtExpanded,
membership_vec.data(),
0);
membership_vec.data());

ASSERT_TRUE(MLCommon::devArrMatch(membership_vec.data(),
params.expected_probabilities.data(),
Expand Down Expand Up @@ -755,8 +751,7 @@ class MembershipVectorTest : public ::testing::TestWithParam<MembershipVectorInp
params.n_points_to_predict,
params.min_samples,
raft::distance::DistanceType::L2SqrtExpanded,
membership_vec.data(),
0);
membership_vec.data());

ASSERT_TRUE(MLCommon::devArrMatch(membership_vec.data(),
params.expected_probabilities.data(),
Expand Down
21 changes: 13 additions & 8 deletions python/cuml/cluster/hdbscan/prediction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML":
float* X,
DistanceType metric,
float* membership_vec,
int batch_size)
size_t batch_size)

void compute_membership_vector(
const handle_t& handle,
Expand All @@ -107,7 +107,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML":
int min_samples,
DistanceType metric,
float* membership_vec,
int batch_size);
size_t batch_size);

void out_of_sample_predict(const handle_t &handle,
CondensedHierarchy[int, float] &condensed_tree,
Expand All @@ -131,7 +131,7 @@ _metrics_mapping = {
}


def all_points_membership_vectors(clusterer, batch_size=0):
def all_points_membership_vectors(clusterer, batch_size=4096):

"""
Predict soft cluster membership vectors for all points in the
Expand All @@ -145,11 +145,13 @@ def all_points_membership_vectors(clusterer, batch_size=0):
A clustering object that has been fit to the data and
had ``prediction_data=True`` set.

batch_size : int, optional, default=0
batch_size : int, optional, default=min(4096, n_rows)
Lowers memory requirement by computing distance-based membership in
smaller batches of points in the training data. For example, a batch
size of 1,000 computes distances for 1,000 points at a time. The
default batch_size is 4096. If the number of rows in the original
dataset is less than 4096, this defaults to the number of rows.

Returns
-------
Expand Down Expand Up @@ -214,6 +216,7 @@ def all_points_membership_vectors(clusterer, batch_size=0):
<CondensedHierarchy[int, float]*><size_t> clusterer.condensed_tree_ptr

cdef handle_t* handle_ = <handle_t*><size_t>clusterer.handle.getHandle()

compute_all_points_membership_vectors(handle_[0],
deref(condensed_tree),
deref(prediction_data_),
Expand All @@ -229,7 +232,7 @@ def all_points_membership_vectors(clusterer, batch_size=0):
clusterer.n_clusters_))


def membership_vector(clusterer, points_to_predict, batch_size=0, convert_dtype=True):
def membership_vector(clusterer, points_to_predict, batch_size=4096, convert_dtype=True):
"""Predict soft cluster membership. The result produces a vector
for each point in ``points_to_predict`` that gives a probability that
the given point is a member of a cluster for each of the selected clusters
Expand All @@ -247,11 +250,13 @@ def membership_vector(clusterer, points_to_predict, batch_size=0, convert_dtype=
have the same dimensionality as the original dataset over which
clusterer was fit.

batch_size : int, optional, default=0
batch_size : int, optional, default=min(4096, n_points_to_predict)
Lowers memory requirement by computing distance-based membership in
smaller batches of points to predict. For example, a batch size of
1,000 computes distances for 1,000 points at a time. The default
batch_size is 4096. If the number of points to predict is less than
4096, this defaults to the number of points to predict.

Returns
-------
Expand Down
12 changes: 6 additions & 6 deletions python/cuml/tests/test_hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,7 @@ def test_hdbscan_plots():
@pytest.mark.parametrize("cluster_selection_epsilon", [0.0, 0.5])
@pytest.mark.parametrize("max_cluster_size", [0])
@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
@pytest.mark.parametrize("batch_size", [0, 128])
@pytest.mark.parametrize("batch_size", [128, 1000])
def test_all_points_membership_vectors_blobs(
nrows,
ncols,
Expand Down Expand Up @@ -593,7 +593,7 @@ def test_all_points_membership_vectors_blobs(
@pytest.mark.parametrize("max_cluster_size", [0])
@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
@pytest.mark.parametrize("connectivity", ["knn"])
@pytest.mark.parametrize("batch_size", [0, 128])
@pytest.mark.parametrize("batch_size", [128, 1000])
def test_all_points_membership_vectors_moons(
nrows,
min_samples,
Expand Down Expand Up @@ -650,7 +650,7 @@ def test_all_points_membership_vectors_moons(
@pytest.mark.parametrize("max_cluster_size", [0])
@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
@pytest.mark.parametrize("connectivity", ["knn"])
@pytest.mark.parametrize("batch_size", [0, 128])
@pytest.mark.parametrize("batch_size", [128, 1000])
def test_all_points_membership_vectors_circles(
nrows,
min_samples,
Expand Down Expand Up @@ -981,7 +981,7 @@ def test_approximate_predict_digits(
@pytest.mark.parametrize("max_cluster_size", [0])
@pytest.mark.parametrize("allow_single_cluster", [True, False])
@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
@pytest.mark.parametrize("batch_size", [0, 128])
@pytest.mark.parametrize("batch_size", [128])
def test_membership_vector_blobs(
nrows,
n_points_to_predict,
Expand Down Expand Up @@ -1057,7 +1057,7 @@ def test_membership_vector_blobs(
@pytest.mark.parametrize("max_cluster_size", [0])
@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
@pytest.mark.parametrize("connectivity", ["knn"])
@pytest.mark.parametrize("batch_size", [0, 16])
@pytest.mark.parametrize("batch_size", [16])
def test_membership_vector_moons(
nrows,
n_points_to_predict,
Expand Down Expand Up @@ -1121,7 +1121,7 @@ def test_membership_vector_moons(
@pytest.mark.parametrize("max_cluster_size", [0])
@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
@pytest.mark.parametrize("connectivity", ["knn"])
@pytest.mark.parametrize("batch_size", [0, 16])
@pytest.mark.parametrize("batch_size", [16])
def test_membership_vector_circles(
nrows,
n_points_to_predict,
Expand Down