Skip to content

Commit

Permalink
SPARK-4111 [MLlib] add regression metrics
Browse files Browse the repository at this point in the history
Add RegressionMetrics.scala, which provides regression metrics for model evaluation, and the corresponding test suite RegressionMetricsSuite.scala.

Author: Yanbo Liang <[email protected]>
Author: liangyanbo <[email protected]>

Closes #2978 from yanbohappy/regression_metrics and squashes the following commits:

730d0a9 [Yanbo Liang] more clearly annotation
3d0bec1 [Yanbo Liang] rename and keep code style
a8ad3e3 [Yanbo Liang] simplify code for keeping style
d454909 [Yanbo Liang] rename parameter and function names, delete unused columns, add reference
2e56282 [liangyanbo] rename r2_score() and remove unused column
43bb12b [liangyanbo] add regression metrics
  • Loading branch information
Yanbo Liang authored and mengxr committed Oct 30, 2014
1 parent c7ad085 commit d932719
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.mllib.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.rdd.RDD
import org.apache.spark.Logging
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, MultivariateOnlineSummarizer}

/**
 * :: Experimental ::
 * Computes evaluation metrics for a regression model.
 *
 * @param predictionAndObservations an RDD of (prediction, observation) pairs.
 */
@Experimental
class RegressionMetrics(predictionAndObservations: RDD[(Double, Double)]) extends Logging {

  /**
   * Column statistics gathered with a MultivariateOnlineSummarizer.
   * Column 0 holds the observations; column 1 holds the residuals
   * (observation - prediction). Evaluated lazily, on first use by a metric.
   */
  private lazy val summary: MultivariateStatisticalSummary = {
    val observationsAndResiduals = predictionAndObservations.map { pair =>
      val (prediction, observation) = pair
      Vectors.dense(observation, observation - prediction)
    }
    observationsAndResiduals.aggregate(new MultivariateOnlineSummarizer())(
      (summarizer, row) => summarizer.add(row),
      (left, right) => left.merge(right)
    )
  }

  /**
   * Returns the explained variance regression score, defined as
   * 1 - variance(y - \hat{y}) / variance(y).
   * Reference: [[http://en.wikipedia.org/wiki/Explained_variation]]
   */
  def explainedVariance: Double = {
    val residualVariance = summary.variance(1)
    val observationVariance = summary.variance(0)
    1 - residualVariance / observationVariance
  }

  /**
   * Returns the mean absolute error: the l1-norm of the residuals divided by
   * the number of (prediction, observation) pairs.
   */
  def meanAbsoluteError: Double = {
    val absoluteErrorSum = summary.normL1(1)
    absoluteErrorSum / summary.count
  }

  /**
   * Returns the mean squared error, obtained by squaring the root mean squared
   * error of the residuals.
   */
  def meanSquaredError: Double = {
    val residualNorm = summary.normL2(1)
    val rootCount = math.sqrt(summary.count)
    val rmse = residualNorm / rootCount
    rmse * rmse
  }

  /**
   * Returns the root mean squared error: the l2-norm of the residuals divided
   * by the square root of the number of pairs.
   */
  def rootMeanSquaredError: Double = {
    val residualNorm = summary.normL2(1)
    residualNorm / math.sqrt(summary.count)
  }

  /**
   * Returns R^2^, the coefficient of determination, computed as
   * 1 - (squared l2-norm of the residuals) / (variance(y) * (count - 1)).
   * Reference: [[http://en.wikipedia.org/wiki/Coefficient_of_determination]]
   */
  def r2: Double = {
    val residualSumOfSquares = math.pow(summary.normL2(1), 2)
    // NOTE(review): this equals the total sum of squares provided the summarizer
    // reports the unbiased (n - 1) sample variance -- confirm against its docs.
    val scaledObservationVariance = summary.variance(0) * (summary.count - 1)
    1 - residualSumOfSquares / scaledObservationVariance
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.mllib.evaluation

import org.scalatest.FunSuite

import org.apache.spark.mllib.util.LocalSparkContext
import org.apache.spark.mllib.util.TestingUtils._

class RegressionMetricsSuite extends FunSuite with LocalSparkContext {

  // Predictions that deviate from the observations by 0.5, 0.5, 0.0 and 1.0.
  test("regression metrics") {
    val samplePairs = Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0))
    val metrics = new RegressionMetrics(sc.parallelize(samplePairs, 2))

    assert(
      metrics.explainedVariance ~== 0.95717 absTol 1E-5,
      "explained variance regression score mismatch")
    assert(
      metrics.meanAbsoluteError ~== 0.5 absTol 1E-5,
      "mean absolute error mismatch")
    assert(
      metrics.meanSquaredError ~== 0.375 absTol 1E-5,
      "mean squared error mismatch")
    assert(
      metrics.rootMeanSquaredError ~== 0.61237 absTol 1E-5,
      "root mean squared error mismatch")
    assert(
      metrics.r2 ~== 0.94861 absTol 1E-5,
      "r2 score mismatch")
  }

  // Every prediction equals its observation, so all error metrics are zero
  // and both goodness-of-fit scores are one.
  test("regression metrics with complete fitting") {
    val exactPairs = Seq((3.0, 3.0), (0.0, 0.0), (2.0, 2.0), (8.0, 8.0))
    val metrics = new RegressionMetrics(sc.parallelize(exactPairs, 2))

    assert(
      metrics.explainedVariance ~== 1.0 absTol 1E-5,
      "explained variance regression score mismatch")
    assert(
      metrics.meanAbsoluteError ~== 0.0 absTol 1E-5,
      "mean absolute error mismatch")
    assert(
      metrics.meanSquaredError ~== 0.0 absTol 1E-5,
      "mean squared error mismatch")
    assert(
      metrics.rootMeanSquaredError ~== 0.0 absTol 1E-5,
      "root mean squared error mismatch")
    assert(
      metrics.r2 ~== 1.0 absTol 1E-5,
      "r2 score mismatch")
  }
}

0 comments on commit d932719

Please sign in to comment.