From 1810dbe8204981986fa2598b5b022a5c74e43732 Mon Sep 17 00:00:00 2001 From: Lu WANG Date: Tue, 1 May 2018 21:11:40 -0700 Subject: [PATCH 1/4] instrumentation improvement for clustering --- .../scala/org/apache/spark/ml/clustering/BisectingKMeans.scala | 2 +- .../scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 2 +- .../src/main/scala/org/apache/spark/ml/clustering/KMeans.scala | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index addc12ac52ec1..831f95fc341eb 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -264,7 +264,7 @@ class BisectingKMeans @Since("2.0.0") ( case Row(point: Vector) => OldVectors.fromML(point) } - val instr = Instrumentation.create(this, rdd) + val instr = Instrumentation.create(this, dataset) instr.logParams(featuresCol, predictionCol, k, maxIter, seed, minDivisibleClusterSize) val bkm = new MLlibBisectingKMeans() diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index b5804900c0358..e23826b8a0bc0 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -350,7 +350,7 @@ class GaussianMixture @Since("2.0.0") ( s"than ${GaussianMixture.MAX_NUM_FEATURES} features because the size of the covariance" + s" matrix is quadratic in the number of features.") - val instr = Instrumentation.create(this, instances) + val instr = Instrumentation.create(this, dataset) instr.logParams(featuresCol, predictionCol, probabilityCol, k, maxIter, seed, tol) instr.logNumFeatures(numFeatures) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index de61c9c089a36..c75f69b417636 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -361,7 +361,7 @@ class KMeans @Since("1.5.0") ( instances.persist(StorageLevel.MEMORY_AND_DISK) } - val instr = Instrumentation.create(this, instances) + val instr = Instrumentation.create(this, dataset) instr.logParams(featuresCol, predictionCol, k, initMode, initSteps, distanceMeasure, maxIter, seed, tol) val algo = new MLlibKMeans() From 07c20a45737f9a4f008eef6da717034670427483 Mon Sep 17 00:00:00 2001 From: Lu WANG Date: Wed, 2 May 2018 14:18:16 -0700 Subject: [PATCH 2/4] add more info for instrument --- .../org/apache/spark/ml/clustering/BisectingKMeans.scala | 4 +++- .../org/apache/spark/ml/clustering/GaussianMixture.scala | 2 ++ .../main/scala/org/apache/spark/ml/clustering/KMeans.scala | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index 831f95fc341eb..0b32689e214f3 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -265,7 +265,8 @@ class BisectingKMeans @Since("2.0.0") ( } val instr = Instrumentation.create(this, dataset) - instr.logParams(featuresCol, predictionCol, k, maxIter, seed, minDivisibleClusterSize) + instr.logParams(featuresCol, predictionCol, k, maxIter, seed, + minDivisibleClusterSize, distanceMeasure) val bkm = new MLlibBisectingKMeans() .setK($(k)) @@ -278,6 +279,7 @@ class BisectingKMeans @Since("2.0.0") ( val summary = new BisectingKMeansSummary( model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) model.setSummary(Some(summary)) + instr.logNamedValue("clusterSizes", summary.clusterSizes.toString) instr.logSuccess(model) model } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index e23826b8a0bc0..72b18a8fc0844 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -423,6 +423,8 @@ class GaussianMixture @Since("2.0.0") ( val summary = new GaussianMixtureSummary(model.transform(dataset), $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood) model.setSummary(Some(summary)) + instr.logNamedValue("logLikelihood", logLikelihood) + instr.logNamedValue("clusterSizes", summary.clusterSizes.toString) instr.logSuccess(model) model } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index c75f69b417636..1595bb3d30bb4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -378,6 +378,7 @@ class KMeans @Since("1.5.0") ( model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) model.setSummary(Some(summary)) + instr.logNamedValue("clusterSizes", summary.clusterSizes.toString) instr.logSuccess(model) if (handlePersistence) { instances.unpersist() From 90532816e2a986b11042719c37cf033272b790f1 Mon Sep 17 00:00:00 2001 From: Lu WANG Date: Thu, 3 May 2018 11:19:40 -0700 Subject: [PATCH 3/4] change the bug for logging clustersizes --- .../scala/org/apache/spark/ml/clustering/BisectingKMeans.scala | 2 +- .../scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 2 +- .../src/main/scala/org/apache/spark/ml/clustering/KMeans.scala | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index 0b32689e214f3..1b7fc7fbe75e4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -279,7 +279,7 @@ class BisectingKMeans @Since("2.0.0") ( val summary = new BisectingKMeansSummary( model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) model.setSummary(Some(summary)) - instr.logNamedValue("clusterSizes", summary.clusterSizes.toString) + instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString(", ")) instr.logSuccess(model) model } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index 72b18a8fc0844..c7d6ecb344345 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -424,7 +424,7 @@ class GaussianMixture @Since("2.0.0") ( $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood) model.setSummary(Some(summary)) instr.logNamedValue("logLikelihood", logLikelihood) - instr.logNamedValue("clusterSizes", summary.clusterSizes.toString) + instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString(", ")) instr.logSuccess(model) model } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index 1595bb3d30bb4..84f3a6fcecf86 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -378,7 +378,7 @@ class KMeans @Since("1.5.0") ( model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) model.setSummary(Some(summary)) - instr.logNamedValue("clusterSizes", summary.clusterSizes.toString) + instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString(", ")) instr.logSuccess(model) if (handlePersistence) { instances.unpersist() From 4e2cb8141c5a0389fb619f3d423021768de91904 Mon Sep 17 00:00:00 2001 From: Lu WANG Date: Fri, 11 May 2018 13:38:44 -0700 Subject: [PATCH 4/4] change the mkString and add TODO for logging Array "ClusterSizes" --- .../scala/org/apache/spark/ml/clustering/BisectingKMeans.scala | 3 ++- .../scala/org/apache/spark/ml/clustering/GaussianMixture.scala | 3 ++- .../src/main/scala/org/apache/spark/ml/clustering/KMeans.scala | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala index 1b7fc7fbe75e4..34ef4f348e7a5 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala @@ -279,7 +279,8 @@ class BisectingKMeans @Since("2.0.0") ( val summary = new BisectingKMeansSummary( model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) model.setSummary(Some(summary)) - instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString(", ")) + // TODO: need to extend logNamedValue to support Array + instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]")) instr.logSuccess(model) model } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala index c7d6ecb344345..63160b7761f61 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala @@ -424,7 +424,8 @@ class GaussianMixture @Since("2.0.0") ( $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood) model.setSummary(Some(summary)) instr.logNamedValue("logLikelihood", logLikelihood) - instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString(", ")) + // TODO: need to extend logNamedValue to support Array + instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]")) instr.logSuccess(model) model } diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index 84f3a6fcecf86..333084e2f334d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -378,7 +378,8 @@ class KMeans @Since("1.5.0") ( model.transform(dataset), $(predictionCol), $(featuresCol), $(k)) model.setSummary(Some(summary)) - instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString(", ")) + // TODO: need to extend logNamedValue to support Array + instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]")) instr.logSuccess(model) if (handlePersistence) { instances.unpersist()