From 884ef961d074831a4309096d976c46a291632840 Mon Sep 17 00:00:00 2001 From: rvch7 Date: Wed, 5 Jul 2023 18:58:40 -0500 Subject: [PATCH 1/4] Restrict HDBSCAN metric options to L2 #5415 --- python/cuml/cluster/hdbscan/hdbscan.pyx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx index aecfc0ff96..0d7f1b0479 100644 --- a/python/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cluster/hdbscan/hdbscan.pyx @@ -165,12 +165,8 @@ cdef extern from "cuml/cluster/hdbscan.hpp" namespace "ML::HDBSCAN::HELPER": float cluster_selection_epsilon) except + _metrics_mapping = { - 'l1': DistanceType.L1, - 'cityblock': DistanceType.L1, - 'manhattan': DistanceType.L1, 'l2': DistanceType.L2SqrtExpanded, 'euclidean': DistanceType.L2SqrtExpanded, - 'cosine': DistanceType.CosineExpanded } @@ -838,7 +834,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): if self.metric in _metrics_mapping: metric = _metrics_mapping[self.metric] else: - raise ValueError("'affinity' %s not supported." % self.affinity) + raise ValueError("'metric' %s not supported(only 'l2' or 'euclidean' is supported)" % self.metric) cdef uintptr_t core_dists_ptr = self.core_dists.ptr From 6b4f63508a82267f023af738d6fec1f77650b424 Mon Sep 17 00:00:00 2001 From: rvch7 Date: Fri, 7 Jul 2023 22:56:39 -0500 Subject: [PATCH 2/4] unit test for HDBSCAN restrict to L2 issue #5415 --- python/cuml/tests/test_hdbscan.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py index 780404bf42..1ff265c591 100644 --- a/python/cuml/tests/test_hdbscan.py +++ b/python/cuml/tests/test_hdbscan.py @@ -473,6 +473,24 @@ def test_hdbscan_core_dists_bug_4054(): ).fit_predict(X) assert adjusted_rand_score(cu_labels_, sk_labels_) > 0.99 + + +@pytest.mark.parametrize("_metric",[ + "euclidean", + "l2", + pytest.param("l1", marks = pytest.mark.xfail), + pytest.param("L2", marks = pytest.mark.xfail) +]) +def test_hdbscan_metric_l2_5415(_metric): + """ + this test verifies that only l2 and euclidean + metric are acceptable + """ + + X, y = make_blobs(n_samples=10000, n_features=15, random_state=12) + + clf = HDBSCAN(metric=_metric) + assert isinstance(clf.fit(X),HDBSCAN) def test_hdbscan_empty_cluster_tree(): From 26ff2b5d80e708e7827aee18ad211a5b10929b6f Mon Sep 17 00:00:00 2001 From: Rvch7 Date: Wed, 12 Jul 2023 21:43:43 -0400 Subject: [PATCH 3/4] updated unit-test --- python/cuml/tests/test_hdbscan.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py index 1ff265c591..abea35c155 100644 --- a/python/cuml/tests/test_hdbscan.py +++ b/python/cuml/tests/test_hdbscan.py @@ -473,24 +473,25 @@ def test_hdbscan_core_dists_bug_4054(): ).fit_predict(X) assert adjusted_rand_score(cu_labels_, sk_labels_) > 0.99 - - -@pytest.mark.parametrize("_metric",[ - "euclidean", - "l2", - pytest.param("l1", marks = pytest.mark.xfail), - pytest.param("L2", marks = pytest.mark.xfail) -]) -def test_hdbscan_metric_l2_5415(_metric): + + +@pytest.mark.parametrize( + "_metric,expected_to_fail", + [("euclidean", False), ("l2", False), ("l1", True), ("L2", True)], +) +def test_hdbscan_metric_l2_5415(_metric, expected_to_fail): """ this test verifies that only l2 and euclidean - metric are acceptable + metric are acceptable """ - X, y = make_blobs(n_samples=10000, n_features=15, random_state=12) clf = HDBSCAN(metric=_metric) - assert isinstance(clf.fit(X),HDBSCAN) + if expected_to_fail: + with pytest.raises(ValueError): + clf.fit(X) + else: + clf.fit(X) def test_hdbscan_empty_cluster_tree(): From 54a065a525c80e44307e368691264f8cc75491bb Mon Sep 17 00:00:00 2001 From: Rvch7 Date: Thu, 13 Jul 2023 17:00:14 -0400 Subject: [PATCH 4/4] requested changes added --- python/cuml/cluster/hdbscan/hdbscan.pyx | 2 +- python/cuml/tests/test_hdbscan.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/python/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cluster/hdbscan/hdbscan.pyx index 0d7f1b0479..c795fcea53 100644 --- a/python/cuml/cluster/hdbscan/hdbscan.pyx +++ b/python/cuml/cluster/hdbscan/hdbscan.pyx @@ -834,7 +834,7 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin): if self.metric in _metrics_mapping: metric = _metrics_mapping[self.metric] else: - raise ValueError("'metric' %s not supported(only 'l2' or 'euclidean' is supported)" % self.metric) + raise ValueError(f"metric '{self.metric}' not supported, only 'l2' and 'euclidean' are currently supported") cdef uintptr_t core_dists_ptr = self.core_dists.ptr diff --git a/python/cuml/tests/test_hdbscan.py b/python/cuml/tests/test_hdbscan.py index abea35c155..0a9a3a6382 100644 --- a/python/cuml/tests/test_hdbscan.py +++ b/python/cuml/tests/test_hdbscan.py @@ -476,22 +476,22 @@ def test_hdbscan_core_dists_bug_4054(): @pytest.mark.parametrize( - "_metric,expected_to_fail", - [("euclidean", False), ("l2", False), ("l1", True), ("L2", True)], + "metric, supported", + [("euclidean", True), ("l1", False), ("l2", True), ("abc", False)], ) -def test_hdbscan_metric_l2_5415(_metric, expected_to_fail): +def test_hdbscan_metric_parameter_input(metric, supported): """ - this test verifies that only l2 and euclidean - metric are acceptable + tests how valid and invalid arguments to the metric + parameter are handled """ X, y = make_blobs(n_samples=10000, n_features=15, random_state=12) - clf = HDBSCAN(metric=_metric) - if expected_to_fail: + clf = HDBSCAN(metric=metric) + if supported: + clf.fit(X) + else: with pytest.raises(ValueError): clf.fit(X) - else: - clf.fit(X) def test_hdbscan_empty_cluster_tree():