apache · mgaido91 · Feb 28, 2018 · Mar 1, 2018 · Mar 10, 2018 · Mar 16, 2018
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -276,7 +276,7 @@ class BisectingKMeans @Since("2.0.0") (
     val parentModel = bkm.run(rdd)
     val model = copyValues(new BisectingKMeansModel(uid, parentModel).setParent(this))
     val summary = new BisectingKMeansSummary(
-      model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
+      model.transform(dataset), $(predictionCol), $(featuresCol), $(k), $(maxIter))
     model.setSummary(Some(summary))
     instr.logSuccess(model)
     model
@@ -312,4 +312,5 @@ class BisectingKMeansSummary private[clustering] (
     predictions: DataFrame,
     predictionCol: String,
     featuresCol: String,
-    k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k)
+    k: Int,
+    numIter: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter)
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/ClusteringSummary.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.ml.clustering
 
-import org.apache.spark.annotation.Experimental
+import org.apache.spark.annotation.{Experimental, Since}
 import org.apache.spark.sql.{DataFrame, Row}
 
 /**
@@ -34,7 +34,8 @@ class ClusteringSummary private[clustering] (
     @transient val predictions: DataFrame,
     val predictionCol: String,
     val featuresCol: String,
-    val k: Int) extends Serializable {
+    val k: Int,
+    @Since("2.4.0") val numIter: Int) extends Serializable {
 
   /**
    * Cluster centers of the transformed data.

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -421,7 +421,7 @@ class GaussianMixture @Since("2.0.0") (
 
     val model = copyValues(new GaussianMixtureModel(uid, weights, gaussianDists)).setParent(this)
     val summary = new GaussianMixtureSummary(model.transform(dataset),
-      $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
+      $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood, iter)
     model.setSummary(Some(summary))
     instr.logSuccess(model)
     model
@@ -692,8 +692,9 @@ class GaussianMixtureSummary private[clustering] (
     @Since("2.0.0") val probabilityCol: String,
     featuresCol: String,
     k: Int,
-    @Since("2.2.0") val logLikelihood: Double)
-  extends ClusteringSummary(predictions, predictionCol, featuresCol, k) {
+    @Since("2.2.0") val logLikelihood: Double,
+    numIter: Int)
+  extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter) {
 
   /**
    * Probability of each cluster.

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -332,7 +332,7 @@ class KMeans @Since("1.5.0") (
     val parentModel = algo.run(instances, Option(instr))
     val model = copyValues(new KMeansModel(uid, parentModel).setParent(this))
     val summary = new KMeansSummary(
-      model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
+      model.transform(dataset), $(predictionCol), $(featuresCol), $(k), parentModel.numIter)
 
     model.setSummary(Some(summary))
     instr.logSuccess(model)
@@ -370,4 +370,5 @@ class KMeansSummary private[clustering] (
     predictions: DataFrame,
     predictionCol: String,
     featuresCol: String,
-    k: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k)
+    k: Int,
+    numIter: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter)
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala
@@ -348,7 +348,7 @@ class KMeans private (
 
     logInfo(s"The cost is $cost.")
 
-    new KMeansModel(centers.map(_.vector), distanceMeasure)
+    new KMeansModel(centers.map(_.vector), distanceMeasure, iteration)
   }
 
   /**

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala
@@ -36,8 +36,9 @@ import org.apache.spark.sql.{Row, SparkSession}
  * A clustering model for K-means. Each point belongs to the cluster with the closest center.
  */
 @Since("0.8.0")
-class KMeansModel @Since("2.4.0") (@Since("1.0.0") val clusterCenters: Array[Vector],
-  @Since("2.4.0") val distanceMeasure: String)
+class KMeansModel private[spark] (@Since("1.0.0") val clusterCenters: Array[Vector],
+  @Since("2.4.0") val distanceMeasure: String,
+  private[spark] val numIter: Int)
   extends Saveable with Serializable with PMMLExportable {
 
   private val distanceMeasureInstance: DistanceMeasure =
@@ -46,6 +47,10 @@ class KMeansModel @Since("2.4.0") (@Since("1.0.0") val clusterCenters: Array[Vec
   private val clusterCentersWithNorm =
     if (clusterCenters == null) null else clusterCenters.map(new VectorWithNorm(_))
 
+  @Since("2.4.0")
+  def this(clusterCenters: Array[Vector], distanceMeasure: String) =
+    this(clusterCenters: Array[Vector], distanceMeasure, -1)
+
   @Since("1.1.0")
   def this(clusterCenters: Array[Vector]) =
     this(clusterCenters: Array[Vector], DistanceMeasure.EUCLIDEAN)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/BisectingKMeansSuite.scala
@@ -99,6 +99,7 @@ class BisectingKMeansSuite
   test("fit, transform and summary") {
     val predictionColName = "bisecting_kmeans_prediction"
     val bkm = new BisectingKMeans().setK(k).setPredictionCol(predictionColName).setSeed(1)
+
     val model = bkm.fit(dataset)
     assert(model.clusterCenters.length === k)
 
@@ -129,6 +130,7 @@ class BisectingKMeansSuite
     assert(clusterSizes.length === k)
     assert(clusterSizes.sum === numRows)
     assert(clusterSizes.forall(_ >= 0))
+    assert(summary.numIter == 20)
 
     model.setSummary(None)
     assert(!model.hasSummary)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/GaussianMixtureSuite.scala
@@ -150,6 +150,7 @@ class GaussianMixtureSuite extends SparkFunSuite with MLlibTestSparkContext
     assert(clusterSizes.length === k)
     assert(clusterSizes.sum === numRows)
     assert(clusterSizes.forall(_ >= 0))
+    assert(summary.numIter == 2)
 
     model.setSummary(None)
     assert(!model.hasSummary)

diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -130,6 +130,7 @@ class KMeansSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultR
     assert(clusterSizes.length === k)
     assert(clusterSizes.sum === numRows)
     assert(clusterSizes.forall(_ >= 0))
+    assert(summary.numIter == 1)
 
     model.setSummary(None)
     assert(!model.hasSummary)

diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
@@ -36,6 +36,11 @@ object MimaExcludes {
 
   // Exclude rules for 2.4.x
   lazy val v24excludes = v23excludes ++ Seq(
+    // [SPARK-23528] Add numIter to ClusteringSummary
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.ClusteringSummary.this"),
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.KMeansSummary.this"),
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.BisectingKMeansSummary.this"),
+
     // [SPARK-23412][ML] Add cosine distance measure to BisectingKmeans
     ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasDistanceMeasure.org$apache$spark$ml$param$shared$HasDistanceMeasure$_setter_$distanceMeasure_="),
     ProblemFilters.exclude[InheritedNewAbstractMethodProblem]("org.apache.spark.ml.param.shared.HasDistanceMeasure.getDistanceMeasure"),
-Original file line number
+Diff line change
@@ Expand Up / @@ -348,7 +348,7 @@ class KMeans private ( @@
         logInfo(s"The cost is $cost.")
-        new KMeansModel(centers.map(_.vector), distanceMeasure)
+        new KMeansModel(centers.map(_.vector), distanceMeasure, iteration)
       }
       /**
@@ Expand Down @@