forked from apache/spark
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SPARK-7574] [ML] [DOC] User guide for OneVsRest
Including Iris Dataset (after shuffling and relabeling 3 -> 0 to conform to 0 -> numClasses-1 labeling). Could not find an existing dataset in data/mllib for multiclass classification. Author: Ram Sriharsha <[email protected]> Closes apache#6296 from harsha2010/SPARK-7574 and squashes the following commits: 645427c [Ram Sriharsha] cleanup 46c41b1 [Ram Sriharsha] cleanup 2f76295 [Ram Sriharsha] Code Review Fixes ebdf103 [Ram Sriharsha] Java Example c026613 [Ram Sriharsha] Code Review fixes 4b7d1a6 [Ram Sriharsha] minor cleanup 13bed9c [Ram Sriharsha] add wikipedia link bb9dbfa [Ram Sriharsha] Clean up naming 6f90db1 [Ram Sriharsha] [SPARK-7574][ml][doc] User guide for OneVsRest
- Loading branch information
1 parent
fd758d7
commit dce43d7
Showing
3 changed files
with
281 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
1 1:-0.222222 2:0.5 3:-0.762712 4:-0.833333 | ||
1 1:-0.555556 2:0.25 3:-0.864407 4:-0.916667 | ||
1 1:-0.722222 2:-0.166667 3:-0.864407 4:-0.833333 | ||
1 1:-0.722222 2:0.166667 3:-0.694915 4:-0.916667 | ||
0 1:0.166667 2:-0.416667 3:0.457627 4:0.5 | ||
1 1:-0.833333 3:-0.864407 4:-0.916667 | ||
2 1:-1.32455e-07 2:-0.166667 3:0.220339 4:0.0833333 | ||
2 1:-1.32455e-07 2:-0.333333 3:0.0169491 4:-4.03573e-08 | ||
1 1:-0.5 2:0.75 3:-0.830508 4:-1 | ||
0 1:0.611111 3:0.694915 4:0.416667 | ||
0 1:0.222222 2:-0.166667 3:0.423729 4:0.583333 | ||
1 1:-0.722222 2:-0.166667 3:-0.864407 4:-1 | ||
1 1:-0.5 2:0.166667 3:-0.864407 4:-0.916667 | ||
2 1:-0.222222 2:-0.333333 3:0.0508474 4:-4.03573e-08 | ||
2 1:-0.0555556 2:-0.833333 3:0.0169491 4:-0.25 | ||
2 1:-0.166667 2:-0.416667 3:-0.0169491 4:-0.0833333 | ||
1 1:-0.944444 3:-0.898305 4:-0.916667 | ||
2 1:-0.277778 2:-0.583333 3:-0.0169491 4:-0.166667 | ||
0 1:0.111111 2:-0.333333 3:0.38983 4:0.166667 | ||
2 1:-0.222222 2:-0.166667 3:0.0847457 4:-0.0833333 | ||
0 1:0.166667 2:-0.333333 3:0.559322 4:0.666667 | ||
1 1:-0.611111 2:0.0833333 3:-0.864407 4:-0.916667 | ||
2 1:-0.333333 2:-0.583333 3:0.0169491 4:-4.03573e-08 | ||
0 1:0.555555 2:-0.166667 3:0.661017 4:0.666667 | ||
2 1:0.166667 3:0.186441 4:0.166667 | ||
2 1:0.111111 2:-0.75 3:0.152542 4:-4.03573e-08 | ||
2 1:0.166667 2:-0.25 3:0.118644 4:-4.03573e-08 | ||
0 1:-0.0555556 2:-0.833333 3:0.355932 4:0.166667 | ||
0 1:-0.277778 2:-0.333333 3:0.322034 4:0.583333 | ||
2 1:-0.222222 2:-0.5 3:-0.152542 4:-0.25 | ||
2 1:-0.111111 3:0.288136 4:0.416667 | ||
2 1:-0.0555556 2:-0.25 3:0.186441 4:0.166667 | ||
2 1:0.333333 2:-0.166667 3:0.355932 4:0.333333 | ||
1 1:-0.611111 2:0.25 3:-0.898305 4:-0.833333 | ||
0 1:0.166667 2:-0.333333 3:0.559322 4:0.75 | ||
0 1:0.111111 2:-0.25 3:0.559322 4:0.416667 | ||
0 1:0.833333 2:-0.166667 3:0.898305 4:0.666667 | ||
2 1:-0.277778 2:-0.166667 3:0.186441 4:0.166667 | ||
0 1:-0.666667 2:-0.583333 3:0.186441 4:0.333333 | ||
1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1 | ||
1 1:-0.166667 2:0.666667 3:-0.932203 4:-0.916667 | ||
0 1:0.0555554 2:-0.333333 3:0.288136 4:0.416667 | ||
1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1 | ||
1 1:-0.833333 2:0.166667 3:-0.864407 4:-0.833333 | ||
0 1:0.0555554 2:0.166667 3:0.491525 4:0.833333 | ||
0 1:0.722222 2:-0.333333 3:0.728813 4:0.5 | ||
2 1:-0.166667 2:-0.416667 3:0.0508474 4:-0.25 | ||
2 1:0.5 3:0.254237 4:0.0833333 | ||
0 1:0.111111 2:-0.583333 3:0.355932 4:0.5 | ||
1 1:-0.944444 2:-0.166667 3:-0.898305 4:-0.916667 | ||
2 1:0.277778 2:-0.25 3:0.220339 4:-4.03573e-08 | ||
0 1:0.666667 2:-0.25 3:0.79661 4:0.416667 | ||
0 1:0.111111 2:0.0833333 3:0.694915 4:1 | ||
0 1:0.444444 3:0.59322 4:0.833333 | ||
2 1:-0.0555556 2:0.166667 3:0.186441 4:0.25 | ||
1 1:-0.833333 2:0.333333 3:-1 4:-0.916667 | ||
1 1:-0.555556 2:0.416667 3:-0.830508 4:-0.75 | ||
2 1:-0.333333 2:-0.5 3:0.152542 4:-0.0833333 | ||
1 1:-1 2:-0.166667 3:-0.966102 4:-1 | ||
1 1:-0.333333 2:0.25 3:-0.898305 4:-0.916667 | ||
2 1:0.388889 2:-0.333333 3:0.288136 4:0.0833333 | ||
2 1:0.277778 2:-0.166667 3:0.152542 4:0.0833333 | ||
0 1:0.333333 2:0.0833333 3:0.59322 4:0.666667 | ||
1 1:-0.777778 3:-0.79661 4:-0.916667 | ||
1 1:-0.444444 2:0.416667 3:-0.830508 4:-0.916667 | ||
0 1:0.222222 2:-0.166667 3:0.627119 4:0.75 | ||
1 1:-0.555556 2:0.5 3:-0.79661 4:-0.916667 | ||
1 1:-0.555556 2:0.5 3:-0.694915 4:-0.75 | ||
2 1:-1.32455e-07 2:-0.25 3:0.254237 4:0.0833333 | ||
1 1:-0.5 2:0.25 3:-0.830508 4:-0.916667 | ||
0 1:0.166667 3:0.457627 4:0.833333 | ||
2 1:0.444444 2:-0.0833334 3:0.322034 4:0.166667 | ||
0 1:0.111111 2:0.166667 3:0.559322 4:0.916667 | ||
1 1:-0.611111 2:0.25 3:-0.79661 4:-0.583333 | ||
0 1:0.388889 3:0.661017 4:0.833333 | ||
1 1:-0.722222 2:0.166667 3:-0.79661 4:-0.916667 | ||
1 1:-0.722222 2:-0.0833334 3:-0.79661 4:-0.916667 | ||
1 1:-0.555556 2:0.166667 3:-0.830508 4:-0.916667 | ||
2 1:-0.666667 2:-0.666667 3:-0.220339 4:-0.25 | ||
2 1:-0.611111 2:-0.75 3:-0.220339 4:-0.25 | ||
2 1:0.0555554 2:-0.833333 3:0.186441 4:0.166667 | ||
0 1:-0.166667 2:-0.416667 3:0.38983 4:0.5 | ||
0 1:0.611111 2:0.333333 3:0.728813 4:1 | ||
2 1:0.0555554 2:-0.25 3:0.118644 4:-4.03573e-08 | ||
1 1:-0.666667 2:-0.166667 3:-0.864407 4:-0.916667 | ||
1 1:-0.833333 2:-0.0833334 3:-0.830508 4:-0.916667 | ||
0 1:0.611111 2:-0.166667 3:0.627119 4:0.25 | ||
0 1:0.888889 2:0.5 3:0.932203 4:0.75 | ||
2 1:0.222222 2:-0.333333 3:0.220339 4:0.166667 | ||
1 1:-0.555556 2:0.25 3:-0.864407 4:-0.833333 | ||
0 1:-1.32455e-07 2:-0.166667 3:0.322034 4:0.416667 | ||
0 1:-1.32455e-07 2:-0.5 3:0.559322 4:0.0833333 | ||
1 1:-0.611111 3:-0.932203 4:-0.916667 | ||
1 1:-0.333333 2:0.833333 3:-0.864407 4:-0.916667 | ||
0 1:-0.166667 2:-0.333333 3:0.38983 4:0.916667 | ||
2 1:-0.333333 2:-0.666667 3:-0.0847458 4:-0.25 | ||
2 1:-0.0555556 2:-0.416667 3:0.38983 4:0.25 | ||
1 1:-0.388889 2:0.416667 3:-0.830508 4:-0.916667 | ||
0 1:0.444444 2:-0.0833334 3:0.38983 4:0.833333 | ||
1 1:-0.611111 2:0.333333 3:-0.864407 4:-0.916667 | ||
0 1:0.111111 2:-0.416667 3:0.322034 4:0.416667 | ||
0 1:0.166667 2:-0.0833334 3:0.525424 4:0.416667 | ||
2 1:0.333333 2:-0.0833334 3:0.152542 4:0.0833333 | ||
0 1:-0.0555556 2:-0.166667 3:0.288136 4:0.416667 | ||
0 1:-0.166667 2:-0.416667 3:0.38983 4:0.5 | ||
1 1:-0.611111 2:0.166667 3:-0.830508 4:-0.916667 | ||
0 1:0.888889 2:-0.166667 3:0.728813 4:0.833333 | ||
2 1:-0.277778 2:-0.25 3:-0.118644 4:-4.03573e-08 | ||
2 1:-0.222222 2:-0.333333 3:0.186441 4:-4.03573e-08 | ||
0 1:0.333333 2:-0.583333 3:0.627119 4:0.416667 | ||
0 1:0.444444 2:-0.0833334 3:0.491525 4:0.666667 | ||
2 1:-0.222222 2:-0.25 3:0.0847457 4:-4.03573e-08 | ||
1 1:-0.611111 2:0.166667 3:-0.79661 4:-0.75 | ||
2 1:-0.277778 2:-0.166667 3:0.0508474 4:-4.03573e-08 | ||
0 1:1 2:0.5 3:0.830508 4:0.583333 | ||
2 1:-0.333333 2:-0.666667 3:-0.0508475 4:-0.166667 | ||
2 1:-0.277778 2:-0.416667 3:0.0847457 4:-4.03573e-08 | ||
0 1:0.888889 2:-0.333333 3:0.932203 4:0.583333 | ||
2 1:-0.111111 2:-0.166667 3:0.0847457 4:0.166667 | ||
2 1:0.111111 2:-0.583333 3:0.322034 4:0.166667 | ||
0 1:0.333333 2:0.0833333 3:0.59322 4:1 | ||
0 1:0.222222 2:-0.166667 3:0.525424 4:0.416667 | ||
1 1:-0.555556 2:0.5 3:-0.830508 4:-0.833333 | ||
0 1:-0.111111 2:-0.166667 3:0.38983 4:0.416667 | ||
0 1:0.888889 2:-0.5 3:1 4:0.833333 | ||
1 1:-0.388889 2:0.583333 3:-0.898305 4:-0.75 | ||
2 1:0.111111 2:0.0833333 3:0.254237 4:0.25 | ||
0 1:0.333333 2:-0.166667 3:0.423729 4:0.833333 | ||
1 1:-0.388889 2:0.166667 3:-0.762712 4:-0.916667 | ||
0 1:0.333333 2:-0.0833334 3:0.559322 4:0.916667 | ||
2 1:-0.333333 2:-0.75 3:0.0169491 4:-4.03573e-08 | ||
1 1:-0.222222 2:1 3:-0.830508 4:-0.75 | ||
1 1:-0.388889 2:0.583333 3:-0.762712 4:-0.75 | ||
2 1:-0.611111 2:-1 3:-0.152542 4:-0.25 | ||
2 1:-1.32455e-07 2:-0.333333 3:0.254237 4:-0.0833333 | ||
2 1:-0.5 2:-0.416667 3:-0.0169491 4:0.0833333 | ||
1 1:-0.888889 2:-0.75 3:-0.898305 4:-0.833333 | ||
1 1:-0.666667 2:-0.0833334 3:-0.830508 4:-1 | ||
2 1:-0.555556 2:-0.583333 3:-0.322034 4:-0.166667 | ||
2 1:-0.166667 2:-0.5 3:0.0169491 4:-0.0833333 | ||
1 1:-0.555556 2:0.0833333 3:-0.762712 4:-0.666667 | ||
1 1:-0.777778 3:-0.898305 4:-0.916667 | ||
0 1:0.388889 2:-0.166667 3:0.525424 4:0.666667 | ||
0 1:0.222222 3:0.38983 4:0.583333 | ||
2 1:0.333333 2:-0.0833334 3:0.254237 4:0.166667 | ||
2 1:-0.388889 2:-0.166667 3:0.186441 4:0.166667 | ||
0 1:-0.222222 2:-0.583333 3:0.355932 4:0.583333 | ||
1 1:-0.611111 2:-0.166667 3:-0.79661 4:-0.916667 | ||
1 1:-0.944444 2:-0.25 3:-0.864407 4:-0.916667 | ||
1 1:-0.388889 2:0.166667 3:-0.830508 4:-0.75 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,129 @@ | ||
--- | ||
layout: global | ||
title: Ensembles | ||
displayTitle: <a href="ml-guide.html">ML</a> - Ensembles | ||
--- | ||
|
||
**Table of Contents** | ||
|
||
* This will become a table of contents (this text will be scraped). | ||
{:toc} | ||
|
||
An [ensemble method](http://en.wikipedia.org/wiki/Ensemble_learning) | ||
is a learning algorithm which creates a model composed of a set of other base models. | ||
The Pipelines API supports the following ensemble algorithms: [`OneVsRest`](api/scala/index.html#org.apache.spark.ml.classification.OneVsRest) | ||
|
||
## OneVsRest | ||
|
||
[OneVsRest](http://en.wikipedia.org/wiki/Multiclass_classification#One-vs.-rest) is an example of a machine learning reduction for performing multiclass classification given a base classifier that can perform binary classification efficiently. | ||
|
||
`OneVsRest` is implemented as an `Estimator`. For the base classifier it takes instances of `Classifier` and creates a binary classification problem for each of the k classes. The classifier for class i is trained to predict whether the label is i or not, distinguishing class i from all other classes. | ||
|
||
Predictions are made by evaluating each binary classifier; the index of the most confident classifier is output as the label. | ||
|
||
### Example | ||
|
||
The example below demonstrates how to load the | ||
[Iris dataset](http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/multiclass/iris.scale), parse it as a DataFrame and perform multiclass classification using `OneVsRest`. The confusion matrix and the false positive rate for each label are computed on the test set to measure the algorithm's accuracy. | ||
|
||
<div class="codetabs"> | ||
<div data-lang="scala" markdown="1"> | ||
{% highlight scala %} | ||
import org.apache.spark.ml.classification.{LogisticRegression, OneVsRest} | ||
import org.apache.spark.mllib.evaluation.MulticlassMetrics | ||
import org.apache.spark.mllib.util.MLUtils | ||
import org.apache.spark.sql.{Row, SQLContext} | ||
|
||
// NOTE(review): `sc` is not defined in this snippet; presumably an existing SparkContext (e.g. the one provided by spark-shell) — confirm | ||
val sqlContext = new SQLContext(sc) | ||
|
||
// load the LIBSVM-format data file, convert it to a DataFrame and split it 70/30 into train and test sets | ||
val data = MLUtils.loadLibSVMFile(sc, | ||
"data/mllib/sample_multiclass_classification_data.txt") | ||
val Array(train, test) = data.toDF().randomSplit(Array(0.7, 0.3)) | ||
|
||
// instantiate the multiclass learner, using logistic regression as the base classifier | ||
val ovr = new OneVsRest().setClassifier(new LogisticRegression) | ||
|
||
// train the multiclass model on the training split | ||
val ovrModel = ovr.fit(train) | ||
|
||
// score the model on the test split, keeping only the prediction and label columns | ||
val predictions = ovrModel.transform(test).select("prediction", "label") | ||
// turn each Row into a (prediction, label) pair of doubles for MulticlassMetrics | ||
val predictionsAndLabels = predictions.map {case Row(p: Double, l: Double) => (p, l)} | ||
|
||
// compute and print the confusion matrix | ||
val metrics = new MulticlassMetrics(predictionsAndLabels) | ||
println(metrics.confusionMatrix) | ||
|
||
// the Iris DataSet has three classes | ||
val numClasses = 3 | ||
|
||
// print the false positive rate for each label, one tab-separated row per label | ||
println("label\tfpr\n") | ||
(0 until numClasses).foreach { index => | ||
val label = index.toDouble | ||
println(label + "\t" + metrics.falsePositiveRate(label)) | ||
} | ||
{% endhighlight %} | ||
</div> | ||
<div data-lang="java" markdown="1"> | ||
{% highlight java %} | ||
|
||
import org.apache.spark.SparkConf; | ||
import org.apache.spark.api.java.JavaSparkContext; | ||
import org.apache.spark.ml.classification.LogisticRegression; | ||
import org.apache.spark.ml.classification.OneVsRest; | ||
import org.apache.spark.ml.classification.OneVsRestModel; | ||
import org.apache.spark.mllib.evaluation.MulticlassMetrics; | ||
import org.apache.spark.mllib.linalg.Matrix; | ||
import org.apache.spark.mllib.regression.LabeledPoint; | ||
import org.apache.spark.mllib.util.MLUtils; | ||
import org.apache.spark.rdd.RDD; | ||
import org.apache.spark.sql.DataFrame; | ||
import org.apache.spark.sql.SQLContext; | ||
|
||
// create the Spark context and the SQL context used below to build the DataFrame | ||
SparkConf conf = new SparkConf().setAppName("JavaOneVsRestExample"); | ||
JavaSparkContext jsc = new JavaSparkContext(conf); | ||
SQLContext jsql = new SQLContext(jsc); | ||
|
||
// load the LIBSVM-format data file as an RDD of LabeledPoint | ||
RDD<LabeledPoint> data = MLUtils.loadLibSVMFile(jsc.sc(), | ||
"data/mllib/sample_multiclass_classification_data.txt"); | ||
|
||
// convert the RDD to a DataFrame and split it 70/30 into train and test sets (12345 is passed as the split seed) | ||
DataFrame dataFrame = jsql.createDataFrame(data, LabeledPoint.class); | ||
DataFrame[] splits = dataFrame.randomSplit(new double[]{0.7, 0.3}, 12345); | ||
DataFrame train = splits[0]; | ||
DataFrame test = splits[1]; | ||
|
||
// instantiate the One Vs Rest Classifier, using logistic regression as the base classifier | ||
OneVsRest ovr = new OneVsRest().setClassifier(new LogisticRegression()); | ||
|
||
// train the multiclass model on the cached training split | ||
OneVsRestModel ovrModel = ovr.fit(train.cache()); | ||
|
||
// score the model on test data, keeping only the prediction and label columns | ||
DataFrame predictions = ovrModel | ||
.transform(test) | ||
.select("prediction", "label"); | ||
|
||
// obtain metrics from the (prediction, label) DataFrame | ||
MulticlassMetrics metrics = new MulticlassMetrics(predictions); | ||
Matrix confusionMatrix = metrics.confusionMatrix(); | ||
|
||
// output the Confusion Matrix | ||
System.out.println("Confusion Matrix"); | ||
System.out.println(confusionMatrix); | ||
|
||
// compute the false positive rate per label, printed as one tab-separated row per label | ||
System.out.println(); | ||
System.out.println("label\tfpr\n"); | ||
|
||
// the Iris DataSet has three classes | ||
int numClasses = 3; | ||
for (int index = 0; index < numClasses; index++) { | ||
double label = (double) index; | ||
System.out.print(label); | ||
System.out.print("\t"); | ||
System.out.print(metrics.falsePositiveRate(label)); | ||
System.out.println(); | ||
} | ||
{% endhighlight %} | ||
</div> | ||
</div> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters