Skip to content

Commit

Permalink
[SPARK-24155][ML] Instrumentation improvements for clustering
Browse files Browse the repository at this point in the history
## What changes were proposed in this pull request?

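Changed the instrumentation for all of the clustering estimators touched by this commit (BisectingKMeans, GaussianMixture, and KMeans): `Instrumentation.create` is now given the input `dataset` instead of the derived RDD, `distanceMeasure` is added to the logged params for BisectingKMeans, and each estimator logs its `clusterSizes` (plus `logLikelihood` for GaussianMixture) before `logSuccess`.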

## How was this patch tested?

N/A

Please review http://spark.apache.org/contributing.html before opening a pull request.

Author: Lu WANG <[email protected]>

Closes #21218 from ludatabricks/SPARK-23686-1.
  • Loading branch information
lu-wang-dl authored and mengxr committed May 14, 2018
1 parent c26f673 commit 075d678
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -261,8 +261,9 @@ class BisectingKMeans @Since("2.0.0") (
transformSchema(dataset.schema, logging = true)
val rdd = DatasetUtils.columnToOldVector(dataset, getFeaturesCol)

val instr = Instrumentation.create(this, rdd)
instr.logParams(featuresCol, predictionCol, k, maxIter, seed, minDivisibleClusterSize)
val instr = Instrumentation.create(this, dataset)
instr.logParams(featuresCol, predictionCol, k, maxIter, seed,
minDivisibleClusterSize, distanceMeasure)

val bkm = new MLlibBisectingKMeans()
.setK($(k))
Expand All @@ -275,6 +276,8 @@ class BisectingKMeans @Since("2.0.0") (
val summary = new BisectingKMeansSummary(
model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
model.setSummary(Some(summary))
// TODO: need to extend logNamedValue to support Array
instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]"))
instr.logSuccess(model)
model
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ class GaussianMixture @Since("2.0.0") (
s"than ${GaussianMixture.MAX_NUM_FEATURES} features because the size of the covariance" +
s" matrix is quadratic in the number of features.")

val instr = Instrumentation.create(this, instances)
val instr = Instrumentation.create(this, dataset)
instr.logParams(featuresCol, predictionCol, probabilityCol, k, maxIter, seed, tol)
instr.logNumFeatures(numFeatures)

Expand Down Expand Up @@ -425,6 +425,9 @@ class GaussianMixture @Since("2.0.0") (
val summary = new GaussianMixtureSummary(model.transform(dataset),
$(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
model.setSummary(Some(summary))
instr.logNamedValue("logLikelihood", logLikelihood)
// TODO: need to extend logNamedValue to support Array
instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]"))
instr.logSuccess(model)
model
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ class KMeans @Since("1.5.0") (
instances.persist(StorageLevel.MEMORY_AND_DISK)
}

val instr = Instrumentation.create(this, instances)
val instr = Instrumentation.create(this, dataset)
instr.logParams(featuresCol, predictionCol, k, initMode, initSteps, distanceMeasure,
maxIter, seed, tol)
val algo = new MLlibKMeans()
Expand All @@ -359,6 +359,8 @@ class KMeans @Since("1.5.0") (
model.transform(dataset), $(predictionCol), $(featuresCol), $(k))

model.setSummary(Some(summary))
// TODO: need to extend logNamedValue to support Array
instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", ",", "]"))
instr.logSuccess(model)
if (handlePersistence) {
instances.unpersist()
Expand Down

0 comments on commit 075d678

Please sign in to comment.