diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index 0b2dede657e6e..22a77ac824a13 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -37,7 +37,7 @@ private[clustering] trait KMeansParams extends Params with HasMaxIter with HasFeaturesCol with HasSeed with HasPredictionCol { /** - * Param for the column name for the number of clusters to create. + * Set the number of clusters to create (k). Default: 2. * @group param */ val k = new Param[Int](this, "k", "number of clusters to create") @@ -46,7 +46,9 @@ private[clustering] trait KMeansParams def getK: Int = $(k) /** - * Param for the column name for the number of runs of the algorithm to execute in parallel. + * Param for the number of runs of the algorithm to execute in parallel. We initialize the + * algorithm this many times with random starting conditions (configured by the initialization + * mode), then return the best clustering found over any run. Default: 1. * @group param */ val runs = new Param[Int](this, "runs", "number of runs of the algorithm to execute in parallel") @@ -55,8 +57,8 @@ private[clustering] trait KMeansParams def getRuns: Int = $(runs) /** - * Param for the column name for the distance threshold - * within which we've consider centers to have converged. + * Param for the distance threshold within which we consider centers to have converged. + * If all centers move less than this Euclidean distance, we stop iterating one run. * @group param */ val epsilon = new Param[Double](this, "epsilon", "distance threshold") @@ -65,7 +67,9 @@ private[clustering] trait KMeansParams def getEpsilon: Double = $(epsilon) /** - * Param for the initialization algorithm. + * Param for the initialization algorithm. 
This can be either "random" to choose random points as + * initial cluster centers, or "k-means||" to use a parallel variant of k-means++ + * (Bahmani et al., Scalable K-Means++, VLDB 2012). Default: k-means||. * @group param */ val initializationMode = new Param[String](this, "initializationMode", "initialization algorithm") @@ -74,7 +78,8 @@ private[clustering] trait KMeansParams def getInitializationMode: String = $(initializationMode) /** - * Param for the number of steps for k-means initialization mode. + * Param for the number of steps for the k-means|| initialization mode. This is an advanced + * setting -- the default of 5 is almost always enough. Default: 5. * @group param */ val initializationSteps =