From 2c1d02d054fe1a8627b8610e8dd6de226b46af55 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Fri, 20 Jan 2017 17:04:21 -0800
Subject: [PATCH 1/3] fix kmeans bug

---
 R/pkg/R/mllib_clustering.R                        | 15 +++++++++------
 R/pkg/inst/tests/testthat/test_mllib_clustering.R | 15 +++++++++++----
 .../org/apache/spark/ml/r/KMeansWrapper.scala     |  2 ++
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/R/pkg/R/mllib_clustering.R b/R/pkg/R/mllib_clustering.R
index fb8d9e75ad091..2676815a40139 100644
--- a/R/pkg/R/mllib_clustering.R
+++ b/R/pkg/R/mllib_clustering.R
@@ -225,10 +225,12 @@ setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = "formula"
 
 #' @param object a fitted k-means model.
 #' @return \code{summary} returns summary information of the fitted model, which is a list.
-#'         The list includes the model's \code{k} (number of cluster centers),
+#'         The list includes the model's \code{k} (the configured number of cluster centers),
 #'         \code{coefficients} (model cluster centers),
-#'         \code{size} (number of data points in each cluster), and \code{cluster}
-#'         (cluster centers of the transformed data).
+#'         \code{size} (number of data points in each cluster), \code{cluster}
+#'         (cluster centers of the transformed data), and \code{clusterSize}
+#'         (the actual number of cluster centers. When using initMode = "random",
+#'         \code{clusterSize} may not equal to \code{k}).
 #' @rdname spark.kmeans
 #' @export
 #' @note summary(KMeansModel) since 2.0.0
@@ -240,16 +242,17 @@ setMethod("summary", signature(object = "KMeansModel"),
             coefficients <- callJMethod(jobj, "coefficients")
             k <- callJMethod(jobj, "k")
             size <- callJMethod(jobj, "size")
-            coefficients <- t(matrix(coefficients, ncol = k))
+            clusterSize <- callJMethod(jobj, "clusterSize")
+            coefficients <- t(matrix(coefficients, ncol = clusterSize))
             colnames(coefficients) <- unlist(features)
-            rownames(coefficients) <- 1:k
+            rownames(coefficients) <- 1:clusterSize
             cluster <- if (is.loaded) {
               NULL
             } else {
               dataFrame(callJMethod(jobj, "cluster"))
             }
             list(k = k, coefficients = coefficients, size = size,
-                 cluster = cluster, is.loaded = is.loaded)
+                 cluster = cluster, is.loaded = is.loaded, clusterSize = clusterSize)
           })
 
 #  Predicted values based on a k-means model
diff --git a/R/pkg/inst/tests/testthat/test_mllib_clustering.R b/R/pkg/inst/tests/testthat/test_mllib_clustering.R
index cfbdea5c041fb..b44d05ee15e6e 100644
--- a/R/pkg/inst/tests/testthat/test_mllib_clustering.R
+++ b/R/pkg/inst/tests/testthat/test_mllib_clustering.R
@@ -145,13 +145,20 @@ test_that("spark.kmeans", {
   model2 <- spark.kmeans(data = df, ~ ., k = 5, maxIter = 10,
                          initMode = "random", seed = 22222, tol = 1E-5)
 
-  fitted.model1 <- fitted(model1)
-  fitted.model2 <- fitted(model2)
+  summary.model1 <- summary(model1)
+  summary.model2 <- summary(model2)
+  cluster1 <- summary.model1$cluster
+  cluster2 <- summary.model2$cluster
+  clusterSize1 <- summary.model1$clusterSize
+  clusterSize2 <- summary.model2$clusterSize
+
   # The predicted clusters are different
-  expect_equal(sort(collect(distinct(select(fitted.model1, "prediction")))$prediction),
+  expect_equal(sort(collect(distinct(select(cluster1, "prediction")))$prediction),
              c(0, 1, 2, 3))
-  expect_equal(sort(collect(distinct(select(fitted.model2, "prediction")))$prediction),
+  expect_equal(sort(collect(distinct(select(cluster2, "prediction")))$prediction),
              c(0, 1, 2))
+  expect_equal(clusterSize1, 4)
+  expect_equal(clusterSize2, 3)
 })
 
 test_that("spark.lda with libsvm", {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/KMeansWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/KMeansWrapper.scala
index a1fefd31c0579..8d596863b459e 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/r/KMeansWrapper.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/r/KMeansWrapper.scala
@@ -43,6 +43,8 @@ private[r] class KMeansWrapper private (
 
   lazy val cluster: DataFrame = kMeansModel.summary.cluster
 
+  lazy val clusterSize: Int = kMeansModel.clusterCenters.size
+
   def fitted(method: String): DataFrame = {
     if (method == "centers") {
       kMeansModel.summary.predictions.drop(kMeansModel.getFeaturesCol)

From d1a2d6c9a83adc184dcc88ec3fd78b63ede39b89 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Sun, 22 Jan 2017 22:59:18 -0800
Subject: [PATCH 2/3]  add is.loaded in comment

---
 R/pkg/R/mllib_clustering.R | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/R/pkg/R/mllib_clustering.R b/R/pkg/R/mllib_clustering.R
index 2676815a40139..52e1d314d4260 100644
--- a/R/pkg/R/mllib_clustering.R
+++ b/R/pkg/R/mllib_clustering.R
@@ -228,7 +228,8 @@ setMethod("spark.kmeans", signature(data = "SparkDataFrame", formula = "formula"
 #'         The list includes the model's \code{k} (the configured number of cluster centers),
 #'         \code{coefficients} (model cluster centers),
 #'         \code{size} (number of data points in each cluster), \code{cluster}
-#'         (cluster centers of the transformed data), and \code{clusterSize}
+#'         (cluster centers of the transformed data), {is.loaded} (whether the model is loaded
+#'         from a saved file), and \code{clusterSize}
 #'         (the actual number of cluster centers. When using initMode = "random",
 #'         \code{clusterSize} may not equal to \code{k}).
 #' @rdname spark.kmeans

From 72fb951205599117c9cf4cbabc89801437c5f735 Mon Sep 17 00:00:00 2001
From: "wm624@hotmail.com" <wm624@hotmail.com>
Date: Tue, 31 Jan 2017 15:16:02 -0800
Subject: [PATCH 3/3] add fix

---
 R/pkg/R/mllib_clustering.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/R/mllib_clustering.R b/R/pkg/R/mllib_clustering.R
index 101205f86182f..8823f90775960 100644
--- a/R/pkg/R/mllib_clustering.R
+++ b/R/pkg/R/mllib_clustering.R
@@ -394,7 +394,7 @@ setMethod("summary", signature(object = "KMeansModel"),
             k <- callJMethod(jobj, "k")
             size <- callJMethod(jobj, "size")
             clusterSize <- callJMethod(jobj, "clusterSize")
-            coefficients <- t(matrix(coefficients, ncol = clusterSize))
+            coefficients <- t(matrix(unlist(coefficients), ncol = clusterSize))
             colnames(coefficients) <- unlist(features)
             rownames(coefficients) <- 1:clusterSize
             cluster <- if (is.loaded) {