Fix fit_transform on KMeans (#4055)

Closes #4043

Authors:
  - Micka (https://github.com/lowener)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: #4055
rapidsai · Jul 16, 2021 · eaa2add · eaa2add
1 parent 16dbaa8
commit eaa2add
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 3 deletions.
diff --git a/python/cuml/cluster/kmeans.pyx b/python/cuml/cluster/kmeans.pyx
@@ -625,8 +625,8 @@ class KMeans(Base,
         Compute clustering and transform X to cluster-distance space.
 
         """
-        return self.fit(X).transform(X, convert_dtype=convert_dtype,
-                                     sample_weight=sample_weight)
+        self.fit(X, sample_weight=sample_weight)
+        return self.transform(X, convert_dtype=convert_dtype)
 
     def get_param_names(self):
         return super().get_param_names() + \

diff --git a/python/cuml/test/test_kmeans.py b/python/cuml/test/test_kmeans.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2020, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -323,3 +323,42 @@ def test_score(nrows, ncols, nclusters, random_state):
 
     cp.testing.assert_allclose(
         actual_score, expected_score, atol=0.1, rtol=1e-5)
+
+
+@pytest.mark.parametrize('nrows', [100])
+@pytest.mark.parametrize('ncols', [25])
+@pytest.mark.parametrize('nclusters', [5])
+@pytest.mark.parametrize('max_weight', [10])
+def test_fit_transform_weighted_kmeans(nrows, ncols, nclusters,
+                                       max_weight, random_state):
+    # Compare weighted fit_transform against the reference cluster.KMeans.
+    # Using fairly high variance between points in clusters
+    cluster_std = 1.0
+    np.random.seed(random_state)
+
+    # weights are ints in [1, max_weight); randint's `high` is exclusive
+    wt = np.random.randint(1, high=max_weight, size=nrows)
+
+    X, y = make_blobs(nrows,
+                      ncols,
+                      nclusters,
+                      cluster_std=cluster_std,
+                      shuffle=False,
+                      random_state=0)
+
+    cuml_kmeans = cuml.KMeans(init="k-means++",
+                              n_clusters=nclusters,
+                              n_init=10,
+                              random_state=random_state,
+                              output_type='numpy')
+
+    cuml_transf = cuml_kmeans.fit_transform(X, sample_weight=wt)
+    cu_score = cuml_kmeans.score(X)
+
+    sk_kmeans = cluster.KMeans(random_state=random_state,
+                               n_clusters=nclusters)
+    sk_transf = sk_kmeans.fit_transform(cp.asnumpy(X), sample_weight=wt)
+    sk_score = sk_kmeans.score(cp.asnumpy(X))
+    # scores must be close and both transforms must have the same shape
+    assert abs(cu_score - sk_score) <= cluster_std * 1.5
+    assert sk_transf.shape == cuml_transf.shape