From d0885526e16c3d5b798d92bf98873cfd67f15301 Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Sun, 30 Mar 2014 18:39:09 -0700 Subject: [PATCH] use static constructor for MLContext --- .../org/apache/spark/mllib/MLContext.scala | 23 +++++++++++++------ .../apache/spark/mllib/MLContextSuite.scala | 7 +++--- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/MLContext.scala b/mllib/src/main/scala/org/apache/spark/mllib/MLContext.scala index fb4d458cd8a09..3cd09b0d48113 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/MLContext.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/MLContext.scala @@ -23,7 +23,12 @@ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD -class MLContext(self: SparkContext) { +/** + * Provides methods related to machine learning on top of [[org.apache.spark.SparkContext]]. + * + * @param sparkContext a [[org.apache.spark.SparkContext]] instance + */ +class MLContext(val sparkContext: SparkContext) { /** * Reads labeled data in the LIBSVM format into an RDD[LabeledPoint]. * The LIBSVM format is a text-based format used by LIBSVM and LIBLINEAR. @@ -34,16 +39,16 @@ class MLContext(self: SparkContext) { * where the feature indices are converted to zero-based. * * @param path file or directory path in any Hadoop-supported file system URI - * @param numFeatures number of features, it will be determined from input - * if a non-positive value is given - *@param labelParser parser for labels, default: _.toDouble + * @param numFeatures number of features, which will be determined from the input data if a + * non-positive value is given. The default value is 0. + * @param labelParser parser for labels, default: _.toDouble * @return labeled data stored as an RDD[LabeledPoint] */ def libSVMFile( path: String, - numFeatures: Int, + numFeatures: Int = 0, labelParser: String => Double = _.toDouble): RDD[LabeledPoint] = { - val parsed = self.textFile(path).map(_.trim).filter(!_.isEmpty).map(_.split(' ')) + val parsed = sparkContext.textFile(path).map(_.trim).filter(!_.isEmpty).map(_.split(' ')) // Determine number of features. val d = if (numFeatures > 0) { numFeatures @@ -70,5 +75,9 @@ class MLContext(self: SparkContext) { } object MLContext { - implicit def sparkContextToMLContext(sc: SparkContext): MLContext = new MLContext(sc) + /** + * Creates an [[org.apache.spark.mllib.MLContext]] instance from + * an [[org.apache.spark.SparkContext]] instance. + */ + def apply(sc: SparkContext): MLContext = new MLContext(sc) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/MLContextSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/MLContextSuite.scala index 743102b54fa9e..05be434590c48 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/MLContextSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/MLContextSuite.scala @@ -24,7 +24,6 @@ import org.scalatest.FunSuite import com.google.common.base.Charsets import com.google.common.io.Files -import org.apache.spark.mllib.MLContext._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.LocalSparkContext @@ -40,8 +39,10 @@ class MLContextSuite extends FunSuite with LocalSparkContext { val file = new File(tempDir.getPath, "part-00000") Files.write(lines, file, Charsets.US_ASCII) - val pointsWithNumFeatures = sc.libSVMFile(tempDir.toURI.toString, 6).collect() - val pointsWithoutNumFeatures = sc.libSVMFile(tempDir.toURI.toString, 0).collect() + val mlc = MLContext(sc) + + val pointsWithNumFeatures = mlc.libSVMFile(tempDir.toURI.toString, 6).collect() + val pointsWithoutNumFeatures = mlc.libSVMFile(tempDir.toURI.toString, 0).collect() for (points <- Seq(pointsWithNumFeatures, pointsWithoutNumFeatures)) { assert(points.length === 3)