Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEA] Modify default batch size in HDBSCAN soft clustering #5335

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
e49d06a
membership_vector initial commit
tarang-jain Feb 18, 2023
436b180
Further updates to membership_vector
tarang-jain Feb 22, 2023
48030b8
Merge branch 'branch-23.04' into fea-membership-vector
tarang-jain Feb 22, 2023
7912dba
Initial testing membership_vector
tarang-jain Feb 23, 2023
4b41edb
Debug statements
tarang-jain Feb 23, 2023
fe0fd34
Merge branch 'fea-membership-vector' of https://github.com/tarang-jai…
tarang-jain Feb 23, 2023
9d5badc
debugging membership_vector
tarang-jain Feb 24, 2023
19f9dd8
membership_vector first working impl
tarang-jain Feb 28, 2023
a4b565c
GoogleTest intermediate commit
tarang-jain Feb 28, 2023
1f4bf78
GTest working
tarang-jain Feb 28, 2023
fdf100b
working tests and styling changes
tarang-jain Feb 28, 2023
e18096a
replace with raft mdspan primitives and add FastIntDiv
tarang-jain Mar 1, 2023
c2aa77e
Merge branch 'branch-23.04' into fea-membership-vector
tarang-jain Mar 1, 2023
182ba31
cpu support
tarang-jain Mar 1, 2023
366ef26
Fix failing pytest
tarang-jain Mar 7, 2023
b60d869
Merge branch 'branch-23.04' into fea-membership-vector
tarang-jain Mar 7, 2023
6bfaae2
modification after merge
tarang-jain Mar 7, 2023
c4e0bf1
Update softmax with raft::linalg reduction
tarang-jain Mar 8, 2023
fb634e4
Remove sync stream
tarang-jain Mar 9, 2023
a49ba87
memory study commit (to be reversed)
tarang-jain Mar 11, 2023
4ed9fd7
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-memb…
tarang-jain Mar 11, 2023
d1712c0
first commit (working)
tarang-jain Mar 13, 2023
f41416a
set batch_size as an arg
tarang-jain Mar 14, 2023
333077a
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-new-…
tarang-jain Mar 14, 2023
71217e2
working build, styling changes
tarang-jain Mar 14, 2023
bdaefa5
batch_size added to membership_vector
tarang-jain Mar 17, 2023
04e76cb
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-new-…
tarang-jain Mar 17, 2023
fa7b44e
Style fix
tarang-jain Mar 17, 2023
45f8ca4
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-memb…
tarang-jain Mar 17, 2023
367de04
Remove print debug statements
tarang-jain Mar 17, 2023
eeb52c2
Resolved failing pytest
tarang-jain Mar 20, 2023
612afb1
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-new-…
tarang-jain Mar 20, 2023
980b1f7
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-memb…
tarang-jain Mar 20, 2023
0bf779b
copyright changes
tarang-jain Mar 20, 2023
98aa237
Merge branch 'branch-23.04' into fea-membership-vector
tarang-jain Mar 20, 2023
3a38769
Merge branch 'branch-23.04' into fea-new-reduce-memory-pressure-apmv
tarang-jain Mar 20, 2023
d387026
Merge branch 'branch-23.04' into fea-membership-vector
tarang-jain Mar 27, 2023
ed40e22
Updates after PR reviews
tarang-jain Mar 28, 2023
387cde8
Merge branch 'fea-membership-vector' of https://github.com/tarang-jai…
tarang-jain Mar 28, 2023
092b3f8
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-memb…
tarang-jain Mar 28, 2023
ef85fd3
Update height_argmax
tarang-jain Mar 28, 2023
52eda5c
Intermediate merge commit
tarang-jain Mar 29, 2023
d8da560
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-new-…
tarang-jain Mar 29, 2023
7a95bfe
Update after merge membership_vector
tarang-jain Mar 29, 2023
dc92f90
Updates after PR Reviews
tarang-jain Mar 30, 2023
615ad10
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-new-…
tarang-jain Mar 30, 2023
7b89484
Merge branch 'branch-23.04' into fea-new-reduce-memory-pressure-apmv
tarang-jain Mar 30, 2023
7f7f0a4
Resolve merge conflicts
tarang-jain Mar 31, 2023
f37b05d
Merge branch 'branch-23.04' of github.com:rapidsai/cuml into fea-new-…
tarang-jain Apr 5, 2023
e5ada81
Update default batch_size
tarang-jain Apr 5, 2023
7335b73
Update docs
tarang-jain Apr 5, 2023
6318ec8
Rerun tests
tarang-jain Apr 5, 2023
35f7dfd
Merge branch 'branch-23.04' into fea-new-reduce-memory-pressure-apmv
dantegd Apr 6, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions cpp/include/cuml/cluster/hdbscan.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ void compute_all_points_membership_vectors(
const float* X,
raft::distance::DistanceType metric,
float* membership_vec,
int batch_size);
size_t batch_size = 4096);

void compute_membership_vector(const raft::handle_t& handle,
HDBSCAN::Common::CondensedHierarchy<int, float>& condensed_tree,
Expand All @@ -470,7 +470,7 @@ void compute_membership_vector(const raft::handle_t& handle,
int min_samples,
raft::distance::DistanceType metric,
float* membership_vec,
int batch_size);
size_t batch_size = 4096);

void out_of_sample_predict(const raft::handle_t& handle,
HDBSCAN::Common::CondensedHierarchy<int, float>& condensed_tree,
Expand Down
25 changes: 12 additions & 13 deletions cpp/src/hdbscan/detail/soft_clustering.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ void dist_membership_vector(const raft::handle_t& handle,
value_idx* exemplar_label_offsets,
value_t* dist_membership_vec,
raft::distance::DistanceType metric,
int batch_size,
size_t batch_size,
bool softmax = false)
{
auto stream = handle.get_stream();
Expand All @@ -82,16 +82,11 @@ void dist_membership_vector(const raft::handle_t& handle,
// compute the number of batches based on the batch size
value_idx n_batches;

if (batch_size == 0) {
n_batches = 1;
batch_size = n_queries;
}
else {
n_batches = raft::ceildiv((int)n_queries, (int)batch_size);
}
n_batches = raft::ceildiv((int)n_queries, (int)batch_size);

for(value_idx bid = 0; bid < n_batches; bid++) {
value_idx batch_offset = bid * batch_size;
value_idx samples_per_batch = min(batch_size, (int)n_queries - batch_offset);
value_idx samples_per_batch = min((value_idx)batch_size, (value_idx)n_queries - batch_offset);
rmm::device_uvector<value_t> dist(samples_per_batch * n_exemplars, stream);

// compute the distances using raft API
Expand Down Expand Up @@ -392,14 +387,16 @@ void all_points_membership_vectors(const raft::handle_t& handle,
const value_t* X,
raft::distance::DistanceType metric,
value_t* membership_vec,
value_idx batch_size)
size_t batch_size)
{
auto stream = handle.get_stream();
auto exec_policy = handle.get_thrust_policy();

size_t m = prediction_data.n_rows;
size_t n = prediction_data.n_cols;
RAFT_EXPECTS(0 <= batch_size && batch_size <= m, "Invalid batch_size. batch_size should be >= 0 and <= the number of samples in the training data");

if (batch_size > m) batch_size = m;
RAFT_EXPECTS(0 < batch_size && batch_size <= m, "Invalid batch_size. batch_size should be > 0 and <= the number of samples in the training data");

auto parents = condensed_tree.get_parents();
auto children = condensed_tree.get_children();
Expand Down Expand Up @@ -507,11 +504,10 @@ void membership_vector(const raft::handle_t& handle,
raft::distance::DistanceType metric,
int min_samples,
value_t* membership_vec,
value_idx batch_size)
size_t batch_size)
{
RAFT_EXPECTS(metric == raft::distance::DistanceType::L2SqrtExpanded,
"Currently only L2 expanded distance is supported");
RAFT_EXPECTS(0 <= batch_size && batch_size <= n_prediction_points, "Invalid batch_size. batch_size should be >= 0 and <= the number of points to predict");

auto stream = handle.get_stream();
auto exec_policy = handle.get_thrust_policy();
Expand All @@ -525,6 +521,9 @@ void membership_vector(const raft::handle_t& handle,
value_idx n_exemplars = prediction_data.get_n_exemplars();
value_t* lambdas = condensed_tree.get_lambdas();

if (batch_size > n_prediction_points) batch_size = n_prediction_points;
RAFT_EXPECTS(0 < batch_size && batch_size <= n_prediction_points, "Invalid batch_size. batch_size should be > 0 and <= the number of samples in the training data");

rmm::device_uvector<value_t> dist_membership_vec(n_prediction_points * n_selected_clusters,
stream);

Expand Down
4 changes: 2 additions & 2 deletions cpp/src/hdbscan/hdbscan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ void compute_all_points_membership_vectors(
const float* X,
raft::distance::DistanceType metric,
float* membership_vec,
int batch_size)
size_t batch_size)
{
HDBSCAN::detail::Predict::all_points_membership_vectors(
handle, condensed_tree, prediction_data, X, metric, membership_vec, batch_size);
Expand All @@ -108,7 +108,7 @@ void compute_membership_vector(const raft::handle_t& handle,
int min_samples,
raft::distance::DistanceType metric,
float* membership_vec,
int batch_size)
size_t batch_size)
{
// Note that (min_samples+1) is passed to the approximate_predict function. This was done for the
// core distance computation to be consistent with scikit-learn-contrib.
Expand Down
9 changes: 2 additions & 7 deletions cpp/test/sg/hdbscan_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,7 @@
#include <cuml/cluster/hdbscan.hpp>
#include <hdbscan/detail/condense.cuh>
#include <hdbscan/detail/extract.cuh>
#include <hdbscan/detail/predict.cuh>
#include <hdbscan/detail/reachability.cuh>
#include <hdbscan/detail/soft_clustering.cuh>
#include <hdbscan/detail/utils.h>

#include <raft/spatial/knn/specializations.cuh>
#include <raft/stats/adjusted_rand_index.cuh>
Expand Down Expand Up @@ -460,8 +457,7 @@ class AllPointsMembershipVectorsTest
prediction_data_,
data.data(),
raft::distance::DistanceType::L2SqrtExpanded,
membership_vec.data(),
0);
membership_vec.data());

ASSERT_TRUE(MLCommon::devArrMatch(membership_vec.data(),
params.expected_probabilities.data(),
Expand Down Expand Up @@ -755,8 +751,7 @@ class MembershipVectorTest : public ::testing::TestWithParam<MembershipVectorInp
params.n_points_to_predict,
params.min_samples,
raft::distance::DistanceType::L2SqrtExpanded,
membership_vec.data(),
0);
membership_vec.data());

ASSERT_TRUE(MLCommon::devArrMatch(membership_vec.data(),
params.expected_probabilities.data(),
Expand Down
21 changes: 13 additions & 8 deletions python/cuml/cluster/hdbscan/prediction.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML":
float* X,
DistanceType metric,
float* membership_vec,
int batch_size)
size_t batch_size)

void compute_membership_vector(
const handle_t& handle,
Expand All @@ -107,7 +107,7 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML":
int min_samples,
DistanceType metric,
float* membership_vec,
int batch_size);
size_t batch_size);

void out_of_sample_predict(const handle_t &handle,
CondensedHierarchy[int, float] &condensed_tree,
Expand All @@ -131,7 +131,7 @@ _metrics_mapping = {
}


def all_points_membership_vectors(clusterer, batch_size=0):
def all_points_membership_vectors(clusterer, batch_size=4096):

"""
Predict soft cluster membership vectors for all points in the
Expand All @@ -145,11 +145,13 @@ def all_points_membership_vectors(clusterer, batch_size=0):
A clustering object that has been fit to the data and
had ``prediction_data=True`` set.

batch_size : int, optional, default=0
batch_size : int, optional, default=min(4096, n_rows)
Lowers memory requirement by computing distance-based membership in
smaller batches of points in the training data. For example, a batch
size of 1,000 computes distances for 1,000 points at a time. The
default batch_size is 4096. If the number of rows in the original
dataset is less than 4096, this defaults to the number of rows.

Returns
-------
Expand Down Expand Up @@ -214,6 +216,7 @@ def all_points_membership_vectors(clusterer, batch_size=0):
<CondensedHierarchy[int, float]*><size_t> clusterer.condensed_tree_ptr

cdef handle_t* handle_ = <handle_t*><size_t>clusterer.handle.getHandle()

compute_all_points_membership_vectors(handle_[0],
deref(condensed_tree),
deref(prediction_data_),
Expand All @@ -229,7 +232,7 @@ def all_points_membership_vectors(clusterer, batch_size=0):
clusterer.n_clusters_))


def membership_vector(clusterer, points_to_predict, batch_size=0, convert_dtype=True):
def membership_vector(clusterer, points_to_predict, batch_size=4096, convert_dtype=True):
"""Predict soft cluster membership. The result produces a vector
for each point in ``points_to_predict`` that gives a probability that
the given point is a member of a cluster for each of the selected clusters
Expand All @@ -247,11 +250,13 @@ def membership_vector(clusterer, points_to_predict, batch_size=0, convert_dtype=
have the same dimensionality as the original dataset over which
clusterer was fit.

batch_size : int, optional, default=0
batch_size : int, optional, default=min(4096, n_points_to_predict)
Lowers memory requirement by computing distance-based membership in
smaller batches of points to predict. For example, a batch size of
1,000 computes distances for 1,000 points at a time. The default
batch_size is 4096. If the number of points to predict is less than
4096, this defaults to the number of points to predict.

Returns
-------
Expand Down
12 changes: 6 additions & 6 deletions python/cuml/tests/test_hdbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -531,7 +531,7 @@ def test_hdbscan_plots():
@pytest.mark.parametrize("cluster_selection_epsilon", [0.0, 0.5])
@pytest.mark.parametrize("max_cluster_size", [0])
@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
@pytest.mark.parametrize("batch_size", [0, 128])
@pytest.mark.parametrize("batch_size", [128, 1000])
def test_all_points_membership_vectors_blobs(
nrows,
ncols,
Expand Down Expand Up @@ -593,7 +593,7 @@ def test_all_points_membership_vectors_blobs(
@pytest.mark.parametrize("max_cluster_size", [0])
@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
@pytest.mark.parametrize("connectivity", ["knn"])
@pytest.mark.parametrize("batch_size", [0, 128])
@pytest.mark.parametrize("batch_size", [128, 1000])
def test_all_points_membership_vectors_moons(
nrows,
min_samples,
Expand Down Expand Up @@ -650,7 +650,7 @@ def test_all_points_membership_vectors_moons(
@pytest.mark.parametrize("max_cluster_size", [0])
@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
@pytest.mark.parametrize("connectivity", ["knn"])
@pytest.mark.parametrize("batch_size", [0, 128])
@pytest.mark.parametrize("batch_size", [128, 1000])
def test_all_points_membership_vectors_circles(
nrows,
min_samples,
Expand Down Expand Up @@ -981,7 +981,7 @@ def test_approximate_predict_digits(
@pytest.mark.parametrize("max_cluster_size", [0])
@pytest.mark.parametrize("allow_single_cluster", [True, False])
@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
@pytest.mark.parametrize("batch_size", [0, 128])
@pytest.mark.parametrize("batch_size", [128])
def test_membership_vector_blobs(
nrows,
n_points_to_predict,
Expand Down Expand Up @@ -1057,7 +1057,7 @@ def test_membership_vector_blobs(
@pytest.mark.parametrize("max_cluster_size", [0])
@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
@pytest.mark.parametrize("connectivity", ["knn"])
@pytest.mark.parametrize("batch_size", [0, 16])
@pytest.mark.parametrize("batch_size", [16])
def test_membership_vector_moons(
nrows,
n_points_to_predict,
Expand Down Expand Up @@ -1121,7 +1121,7 @@ def test_membership_vector_moons(
@pytest.mark.parametrize("max_cluster_size", [0])
@pytest.mark.parametrize("cluster_selection_method", ["eom", "leaf"])
@pytest.mark.parametrize("connectivity", ["knn"])
@pytest.mark.parametrize("batch_size", [0, 16])
@pytest.mark.parametrize("batch_size", [16])
def test_membership_vector_circles(
nrows,
n_points_to_predict,
Expand Down