[SPARK-24155][ML] Instrumentation improvements for clustering

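## What changes were proposed in this pull request?

Changed the instrumentation for the clustering methods (BisectingKMeans, GaussianMixture, and KMeans): `Instrumentation.create` is now called with the input `dataset` instead of the intermediate `rdd`/`instances` value, BisectingKMeans additionally logs the `distanceMeasure` param, and each of the three logs `clusterSizes` (GaussianMixture also logs `logLikelihood`) after the model summary is set and before `logSuccess`.

## How was this patch tested?

N/A

Please review http://spark.apache.org/contributing.html before opening a pull request.

Author: Lu WANG <[email protected]>

Closes #21218 from ludatabricks/SPARK-23686-1.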
apache · May 14, 2018 · 075d678 · 075d678
1 parent c26f673
commit 075d678
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 4 deletions.
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -261,8 +261,9 @@ class BisectingKMeans @Since("2.0.0") (
     transformSchema(dataset.schema, logging = true)
     val rdd = DatasetUtils.columnToOldVector(dataset, getFeaturesCol)
 
-    val instr = Instrumentation.create(this, rdd)
-    instr.logParams(featuresCol, predictionCol, k, maxIter, seed, minDivisibleClusterSize)
+    val instr = Instrumentation.create(this, dataset)
+    instr.logParams(featuresCol, predictionCol, k, maxIter, seed,
+      minDivisibleClusterSize, distanceMeasure)
 
     val bkm = new MLlibBisectingKMeans()
       .setK($(k))
@@ -275,6 +276,8 @@ class BisectingKMeans @Since("2.0.0") (
     val summary = new BisectingKMeansSummary(
       model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
     model.setSummary(Some(summary))
+    // TODO: need to extend logNamedValue to support Array
+    instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]"))
     instr.logSuccess(model)
     model
   }

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -352,7 +352,7 @@ class GaussianMixture @Since("2.0.0") (
       s"than ${GaussianMixture.MAX_NUM_FEATURES} features because the size of the covariance" +
       s" matrix is quadratic in the number of features.")
 
-    val instr = Instrumentation.create(this, instances)
+    val instr = Instrumentation.create(this, dataset)
     instr.logParams(featuresCol, predictionCol, probabilityCol, k, maxIter, seed, tol)
     instr.logNumFeatures(numFeatures)
 
@@ -425,6 +425,9 @@ class GaussianMixture @Since("2.0.0") (
     val summary = new GaussianMixtureSummary(model.transform(dataset),
       $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
     model.setSummary(Some(summary))
+    instr.logNamedValue("logLikelihood", logLikelihood)
+    // TODO: need to extend logNamedValue to support Array
+    instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]"))
     instr.logSuccess(model)
     model
   }

diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -342,7 +342,7 @@ class KMeans @Since("1.5.0") (
       instances.persist(StorageLevel.MEMORY_AND_DISK)
     }
 
-    val instr = Instrumentation.create(this, instances)
+    val instr = Instrumentation.create(this, dataset)
     instr.logParams(featuresCol, predictionCol, k, initMode, initSteps, distanceMeasure,
       maxIter, seed, tol)
     val algo = new MLlibKMeans()
@@ -359,6 +359,8 @@ class KMeans @Since("1.5.0") (
       model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
 
     model.setSummary(Some(summary))
+    // TODO: need to extend logNamedValue to support Array
+    instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]"))
     instr.logSuccess(model)
     if (handlePersistence) {
       instances.unpersist()