From 8c6c0e193132b90cc4daf422ecc89ae6e2444f6a Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Fri, 28 Mar 2014 11:40:43 +0800 Subject: [PATCH 01/38] add basic statistics --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 103 +++++++++++++++++ .../org/apache/spark/mllib/util/MLUtils.scala | 1 + .../mllib/rdd/VectorRDDFunctionsSuite.scala | 109 ++++++++++++++++++ 3 files changed, 213 insertions(+) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala create mode 100644 mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala new file mode 100644 index 0000000000000..829c058ec03c6 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.mllib.rdd + +import breeze.linalg.{Vector => BV, *} + +import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.util.MLUtils._ +import org.apache.spark.rdd.RDD + +/** + * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an implicit conversion. + * Import `org.apache.spark.MLContext._` at the top of your program to use these functions. + */ +class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { + + def rowMeans(): RDD[Double] = { + self.map(x => x.toArray.sum / x.size) + } + + def rowNorm2(): RDD[Double] = { + self.map(x => math.sqrt(x.toArray.map(x => x*x).sum)) + } + + def rowSDs(): RDD[Double] = { + val means = self.rowMeans() + self.zip(means) + .map{ case(x, m) => x.toBreeze - m } + .map{ x => math.sqrt(x.toArray.map(x => x*x).sum / x.size) } + } + + def colMeansOption(): Vector = { + ??? + } + + def colNorm2Option(): Vector = { + ??? + } + + def colSDsOption(): Vector = { + ??? 
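+    // Note: `???` is scala.Predef.???; it satisfies any result type at compile
+    // time but throws NotImplementedError when called, so the *Option methods
+    // above are unimplemented placeholders.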
+ } + + def colMeans(): Vector = { + Vectors.fromBreeze(self.map(_.toBreeze).zipWithIndex().fold((BV.zeros(1), 0L)) { + case ((lhsVec, lhsCnt), (rhsVec, rhsCnt)) => + val totalNow: BV[Double] = lhsVec :* lhsCnt.asInstanceOf[Double] + val totalNew: BV[Double] = (totalNow + rhsVec) :/ rhsCnt.asInstanceOf[Double] + (totalNew, rhsCnt) + }._1) + } + + def colNorm2(): Vector = Vectors.fromBreeze( + breezeVector = self.map(_.toBreeze).fold(BV.zeros(1)) { + case (lhs, rhs) => lhs + rhs :* rhs + }.map(math.sqrt)) + + def colSDs(): Vector = { + val means = this.colMeans() + Vectors.fromBreeze( + breezeVector = self.map(x => x.toBreeze - means.toBreeze) + .zipWithIndex() + .fold((BV.zeros(1), 0L)) { + case ((lhsVec, lhsCnt), (rhsVec, rhsCnt)) => + val totalNow: BV[Double] = lhsVec :* lhsCnt.asInstanceOf[Double] + val totalNew: BV[Double] = (totalNow + rhsVec :* rhsVec) :/ rhsCnt.asInstanceOf[Double] + (totalNew, rhsCnt) + }._1.map(math.sqrt)) + } + + private def maxMinOption(cmp: (Vector, Vector) => Boolean): Option[Vector] = { + def cmpMaxMin(x1: Vector, x2: Vector) = if (cmp(x1, x2)) x1 else x2 + self.mapPartitions { iterator => + Seq(iterator.reduceOption(cmpMaxMin)).iterator + }.collect { case Some(x) => x }.collect().reduceOption(cmpMaxMin) + } + + def maxOption(cmp: (Vector, Vector) => Boolean) = maxMinOption(cmp) + + def minOption(cmp: (Vector, Vector) => Boolean) = maxMinOption(!cmp(_, _)) + + def rowShrink(): RDD[Vector] = { + ??? + } + + def colShrink(): RDD[Vector] = { + ??? + } +} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index ac2360c429e2b..2bc3ab97ca2fc 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -265,4 +265,5 @@ object MLUtils { } sqDist } + implicit def rddToVectorRDDFunctions(rdd: RDD[Vector]) = new VectorRDDFunctions(rdd) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala new file mode 100644 index 0000000000000..465da3e1a2581 --- /dev/null +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.mllib.rdd + +import org.apache.spark.mllib.linalg.Vector +import org.scalatest.FunSuite + +import org.apache.spark.mllib.linalg.Vectors +import org.apache.spark.mllib.util.MLUtils._ +import VectorRDDFunctionsSuite._ +import org.apache.spark.mllib.util.LocalSparkContext + +class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { + + val localData = Array( + Vectors.dense(1.0, 2.0, 3.0), + Vectors.dense(4.0, 5.0, 6.0), + Vectors.dense(7.0, 8.0, 9.0) + ) + + val rowMeans = Array(2.0, 5.0, 8.0) + val rowNorm2 = Array(math.sqrt(14.0), math.sqrt(77.0), math.sqrt(194.0)) + val rowSDs = Array(math.sqrt(2.0 / 3.0), math.sqrt(2.0 / 3.0), math.sqrt(2.0 / 3.0)) + + val colMeans = Array(4.0, 5.0, 6.0) + val colNorm2 = Array(math.sqrt(66.0), math.sqrt(93.0), math.sqrt(126.0)) + val colSDs = Array(math.sqrt(6.0), math.sqrt(6.0), math.sqrt(6.0)) + + val maxVec = Array(7.0, 8.0, 9.0) + val minVec = Array(1.0, 2.0, 3.0) + + test("rowMeans") { + val data = sc.parallelize(localData) + assert(equivVector(Vectors.dense(data.rowMeans().collect()), Vectors.dense(rowMeans)), "Row means do not match.") + } + + test("rowNorm2") { + val data = sc.parallelize(localData) + assert(equivVector(Vectors.dense(data.rowNorm2().collect()), Vectors.dense(rowNorm2)), "Row norm2s do not match.") + } + + test("rowSDs") { + val data = sc.parallelize(localData) + assert(equivVector(Vectors.dense(data.rowSDs().collect()), Vectors.dense(rowSDs)), "Row SDs do not match.") + } + + test("colMeans") { + val data = sc.parallelize(localData) + assert(equivVector(data.colMeans(), Vectors.dense(colMeans)), "Column means do not match.") + } + + test("colNorm2") { + val data = sc.parallelize(localData) + assert(equivVector(data.colNorm2(), Vectors.dense(colNorm2)), "Column norm2s do not match.") + } + + test("colSDs") { + val data = sc.parallelize(localData) + assert(equivVector(data.colSDs(), Vectors.dense(colSDs)), "Column SDs do not match.") + } + + test("maxOption") { + val data = sc.parallelize(localData) + assert(equivVectorOption( + data.maxOption((lhs: Vector, rhs: Vector) => lhs.toBreeze.norm(2) >= rhs.toBreeze.norm(2)), + Some(Vectors.dense(maxVec))), + "Optional maximum does not match." + ) + } + + test("minOption") { + val data = sc.parallelize(localData) + assert(equivVectorOption( + data.minOption((lhs: Vector, rhs: Vector) => lhs.toBreeze.norm(2) >= rhs.toBreeze.norm(2)), + Some(Vectors.dense(minVec))), + "Optional minimum does not match." 
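+      // The comparator here is the same norm-based `>=` used in the maxOption
+      // test; minOption is expected to negate it internally.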
+    )
+  }
+}
+
+object VectorRDDFunctionsSuite {
+  def equivVector(lhs: Vector, rhs: Vector): Boolean = {
+    (lhs.toBreeze - rhs.toBreeze).norm(2) < 1e-9
+  }
+
+  def equivVectorOption(lhs: Option[Vector], rhs: Option[Vector]): Boolean = {
+    (lhs, rhs) match {
+      case (Some(a), Some(b)) => (a.toBreeze - b.toBreeze).norm(2) < 1e-9
+      case (None, None) => true
+      case _ => false
+    }
+  }
+}

From 54b19ab25b937ff89af91438017164ffa94dbe1d Mon Sep 17 00:00:00 2001
From: Xusen Yin
Date: Fri, 28 Mar 2014 18:23:54 +0800
Subject: [PATCH 02/38] add new API to shrink RDD[Vector]

---
 .../apache/spark/mllib/rdd/VectorRDDFunctions.scala | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala
index 829c058ec03c6..c6ac527660709 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala
@@ -93,11 +93,18 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
 
   def minOption(cmp: (Vector, Vector) => Boolean) = maxMinOption(!cmp(_, _))
 
-  def rowShrink(): RDD[Vector] = {
+  def rowShrink(): RDD[Vector] = self.filter(x => x.toArray.sum != 0)
+
+  def colShrink(): RDD[Vector] = {
+    val means = self.colMeans()
+    self.map( v => Vectors.dense(v.toArray.zip(means.toArray).filter{ case (x, m) => m != 0.0 }.map(_._1)))
+  }
+
+  def colShrinkWithFilter(): (RDD[Vector], RDD[Boolean]) = {
     ???
   }
 
-  def colShrink(): RDD[Vector] = {
+  def rowShrinkWithFilter(): (RDD[Vector], RDD[Boolean]) = {
     ???
   }
 }

From 28cf060a9adec04447659760efb63cfb1c3963c6 Mon Sep 17 00:00:00 2001
From: Xusen Yin
Date: Sat, 29 Mar 2014 09:25:35 +0800
Subject: [PATCH 03/38] fix error of column means

---
 .../spark/mllib/rdd/VectorRDDFunctions.scala  | 69 ++++++++-----------
 .../mllib/rdd/VectorRDDFunctionsSuite.scala   |  1 +
 2 files changed, 31 insertions(+), 39 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala
index c6ac527660709..65fe2159517e9 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala
@@ -43,43 +43,42 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
       .map{ x => math.sqrt(x.toArray.map(x => x*x).sum / x.size) }
   }
 
-  def colMeansOption(): Vector = {
-    ???
+  def colMeans(): Vector = colMeans(self.take(1).head.size)
+
+  def colMeans(size: Int): Vector = {
+    Vectors.fromBreeze(self.map(_.toBreeze).aggregate((BV.zeros[Double](size), 0.0))(
+      seqOp = (c, v) => (c, v) match {
+        case ((prev, cnt), current) =>
+          (((prev :* cnt) + current) :/ (cnt + 1.0), cnt + 1.0)
+      },
+      combOp = (lhs, rhs) => (lhs, rhs) match {
+        case ((lhsVec, lhsCnt), (rhsVec, rhsCnt)) =>
+          ((lhsVec :* lhsCnt) + (rhsVec :* rhsCnt) :/ (lhsCnt + rhsCnt), lhsCnt + rhsCnt)
+      }
+    )._1)
   }
 
-  def colNorm2Option(): Vector = {
-    ???
-  }
-
-  def colSDsOption(): Vector = {
-    ???
- } - - def colMeans(): Vector = { - Vectors.fromBreeze(self.map(_.toBreeze).zipWithIndex().fold((BV.zeros(1), 0L)) { - case ((lhsVec, lhsCnt), (rhsVec, rhsCnt)) => - val totalNow: BV[Double] = lhsVec :* lhsCnt.asInstanceOf[Double] - val totalNew: BV[Double] = (totalNow + rhsVec) :/ rhsCnt.asInstanceOf[Double] - (totalNew, rhsCnt) - }._1) - } + def colNorm2(): Vector = colNorm2(self.take(1).head.size) - def colNorm2(): Vector = Vectors.fromBreeze( - breezeVector = self.map(_.toBreeze).fold(BV.zeros(1)) { - case (lhs, rhs) => lhs + rhs :* rhs + def colNorm2(size: Int): Vector = Vectors.fromBreeze(self.map(_.toBreeze).fold(BV.zeros[Double](size)) { + case (lhs, rhs) => + lhs + (rhs :* rhs) }.map(math.sqrt)) - def colSDs(): Vector = { + def colSDs(): Vector = colSDs(self.take(1).head.size) + + def colSDs(size: Int): Vector = { val means = this.colMeans() - Vectors.fromBreeze( - breezeVector = self.map(x => x.toBreeze - means.toBreeze) - .zipWithIndex() - .fold((BV.zeros(1), 0L)) { - case ((lhsVec, lhsCnt), (rhsVec, rhsCnt)) => - val totalNow: BV[Double] = lhsVec :* lhsCnt.asInstanceOf[Double] - val totalNew: BV[Double] = (totalNow + rhsVec :* rhsVec) :/ rhsCnt.asInstanceOf[Double] - (totalNew, rhsCnt) - }._1.map(math.sqrt)) + Vectors.fromBreeze(self.map(x => x.toBreeze - means.toBreeze).aggregate((BV.zeros[Double](size), 0.0))( + seqOp = (c, v) => (c, v) match { + case ((prev, cnt), current) => + (((prev :* cnt) + current) :/ (cnt + 1.0), cnt + 1.0) + }, + combOp = (lhs, rhs) => (lhs, rhs) match { + case ((lhsVec, lhsCnt), (rhsVec, rhsCnt)) => + ((lhsVec :* lhsCnt) + (rhsVec :* rhsCnt) :/ (lhsCnt + rhsCnt), lhsCnt + rhsCnt) + } + )._1.map(math.sqrt)) } private def maxMinOption(cmp: (Vector, Vector) => Boolean): Option[Vector] = { @@ -99,12 +98,4 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { val means = self.colMeans() self.map( v => Vectors.dense(v.toArray.zip(means.toArray).filter{ case (x, m) => m != 0.0 }.map(_._1))) } - - def colShrinkWithFilter(): (RDD[Vector], RDD[Boolean]) = { - ??? - } - - def rowShrinkWithFilter(): (RDD[Vector], RDD[Boolean]) = { - ??? 
- } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index 465da3e1a2581..2a20980634967 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -71,6 +71,7 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { test("colSDs") { val data = sc.parallelize(localData) + val test = data.colSDs() assert(equivVector(data.colSDs(), Vectors.dense(colSDs)), "Column SDs do not match.") } From 8ef33777c39a060be18285493450f0ed103b5c22 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Sat, 29 Mar 2014 09:42:39 +0800 Subject: [PATCH 04/38] pass all tests --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 12 ++++++------ .../mllib/rdd/VectorRDDFunctionsSuite.scala | 17 ++++++++--------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 65fe2159517e9..6ac0dd5f9b634 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -60,19 +60,19 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { def colNorm2(): Vector = colNorm2(self.take(1).head.size) - def colNorm2(size: Int): Vector = Vectors.fromBreeze(self.map(_.toBreeze).fold(BV.zeros[Double](size)) { - case (lhs, rhs) => - lhs + (rhs :* rhs) - }.map(math.sqrt)) + def colNorm2(size: Int): Vector = Vectors.fromBreeze(self.map(_.toBreeze).aggregate(BV.zeros[Double](size))( + seqOp = (c, v) => c + (v :* v), + combOp = (lhs, rhs) => lhs + rhs + ).map(math.sqrt)) def colSDs(): Vector = colSDs(self.take(1).head.size) def colSDs(size: Int): Vector = { - val means = this.colMeans() + val means = self.colMeans() Vectors.fromBreeze(self.map(x => x.toBreeze - means.toBreeze).aggregate((BV.zeros[Double](size), 0.0))( seqOp = (c, v) => (c, v) match { case ((prev, cnt), current) => - (((prev :* cnt) + current) :/ (cnt + 1.0), cnt + 1.0) + (((prev :* cnt) + (current :* current)) :/ (cnt + 1.0), cnt + 1.0) }, combOp = (lhs, rhs) => (lhs, rhs) match { case ((lhsVec, lhsCnt), (rhsVec, rhsCnt)) => diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index 2a20980634967..20a8f3c94ee69 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -45,38 +45,37 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { val minVec = Array(1.0, 2.0, 3.0) test("rowMeans") { - val data = sc.parallelize(localData) + val data = sc.parallelize(localData, 2) assert(equivVector(Vectors.dense(data.rowMeans().collect()), Vectors.dense(rowMeans)), "Row means do not match.") } test("rowNorm2") { - val data = sc.parallelize(localData) + val data = sc.parallelize(localData, 2) assert(equivVector(Vectors.dense(data.rowNorm2().collect()), Vectors.dense(rowNorm2)), "Row norm2s do not match.") } test("rowSDs") { - val data = sc.parallelize(localData) + val data = sc.parallelize(localData, 2) assert(equivVector(Vectors.dense(data.rowSDs().collect()), Vectors.dense(rowSDs)), "Row SDs do not match.") } 
test("colMeans") { - val data = sc.parallelize(localData) + val data = sc.parallelize(localData, 2) assert(equivVector(data.colMeans(), Vectors.dense(colMeans)), "Column means do not match.") } test("colNorm2") { - val data = sc.parallelize(localData) + val data = sc.parallelize(localData, 2) assert(equivVector(data.colNorm2(), Vectors.dense(colNorm2)), "Column norm2s do not match.") } test("colSDs") { - val data = sc.parallelize(localData) - val test = data.colSDs() + val data = sc.parallelize(localData, 2) assert(equivVector(data.colSDs(), Vectors.dense(colSDs)), "Column SDs do not match.") } test("maxOption") { - val data = sc.parallelize(localData) + val data = sc.parallelize(localData, 2) assert(equivVectorOption( data.maxOption((lhs: Vector, rhs: Vector) => lhs.toBreeze.norm(2) >= rhs.toBreeze.norm(2)), Some(Vectors.dense(maxVec))), @@ -85,7 +84,7 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { } test("minOption") { - val data = sc.parallelize(localData) + val data = sc.parallelize(localData, 2) assert(equivVectorOption( data.minOption((lhs: Vector, rhs: Vector) => lhs.toBreeze.norm(2) >= rhs.toBreeze.norm(2)), Some(Vectors.dense(minVec))), From e09d5d279957ff4f097771c47955c5a70df12474 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Sat, 29 Mar 2014 10:48:44 +0800 Subject: [PATCH 05/38] add scala docs and refine shrink method --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 63 +++++++++++++++++-- 1 file changed, 59 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 6ac0dd5f9b634..9ec7712142b1f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.mllib.rdd -import breeze.linalg.{Vector => BV, *} +import breeze.linalg.{Vector => BV, DenseVector => BDV} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLUtils._ @@ -28,14 +28,23 @@ import org.apache.spark.rdd.RDD */ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { + /** + * Compute the mean of each `Vector` in the RDD. + */ def rowMeans(): RDD[Double] = { self.map(x => x.toArray.sum / x.size) } + /** + * Compute the norm-2 of each `Vector` in the RDD. + */ def rowNorm2(): RDD[Double] = { self.map(x => math.sqrt(x.toArray.map(x => x*x).sum)) } + /** + * Compute the standard deviation of each `Vector` in the RDD. + */ def rowSDs(): RDD[Double] = { val means = self.rowMeans() self.zip(means) @@ -43,8 +52,14 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { .map{ x => math.sqrt(x.toArray.map(x => x*x).sum / x.size) } } + /** + * Compute the mean of each column in the RDD. + */ def colMeans(): Vector = colMeans(self.take(1).head.size) + /** + * Compute the mean of each column in the RDD with `size` as the dimension of each `Vector`. + */ def colMeans(size: Int): Vector = { Vectors.fromBreeze(self.map(_.toBreeze).aggregate((BV.zeros[Double](size), 0.0))( seqOp = (c, v) => (c, v) match { @@ -58,15 +73,27 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { )._1) } + /** + * Compute the norm-2 of each column in the RDD. + */ def colNorm2(): Vector = colNorm2(self.take(1).head.size) + /** + * Compute the norm-2 of each column in the RDD with `size` as the dimension of each `Vector`. 
+ */ def colNorm2(size: Int): Vector = Vectors.fromBreeze(self.map(_.toBreeze).aggregate(BV.zeros[Double](size))( seqOp = (c, v) => c + (v :* v), combOp = (lhs, rhs) => lhs + rhs ).map(math.sqrt)) + /** + * Compute the standard deviation of each column in the RDD. + */ def colSDs(): Vector = colSDs(self.take(1).head.size) + /** + * Compute the standard deviation of each column in the RDD with `size` as the dimension of each `Vector`. + */ def colSDs(size: Int): Vector = { val means = self.colMeans() Vectors.fromBreeze(self.map(x => x.toBreeze - means.toBreeze).aggregate((BV.zeros[Double](size), 0.0))( @@ -81,6 +108,9 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { )._1.map(math.sqrt)) } + /** + * Find the optional max or min vector in the RDD. + */ private def maxMinOption(cmp: (Vector, Vector) => Boolean): Option[Vector] = { def cmpMaxMin(x1: Vector, x2: Vector) = if (cmp(x1, x2)) x1 else x2 self.mapPartitions { iterator => @@ -88,14 +118,39 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { }.collect { case Some(x) => x }.collect().reduceOption(cmpMaxMin) } + /** + * Find the optional max vector in the RDD, `None` will be returned if there is no elements at all. + */ def maxOption(cmp: (Vector, Vector) => Boolean) = maxMinOption(cmp) + /** + * Find the optional min vector in the RDD, `None` will be returned if there is no elements at all. + */ def minOption(cmp: (Vector, Vector) => Boolean) = maxMinOption(!cmp(_, _)) - def rowShrink(): RDD[Vector] = self.filter(x => x.toArray.sum != 0) + /** + * Filter the vectors whose standard deviation is not zero. + */ + def rowShrink(): RDD[Vector] = self.zip(self.rowSDs()).filter(_._2 != 0.0).map(_._1) + /** + * Filter each column of the RDD whose standard deviation is not zero. 
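+   * A column whose standard deviation is zero is constant across all rows, so
+   * dropping it loses no information.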
+ */ def colShrink(): RDD[Vector] = { - val means = self.colMeans() - self.map( v => Vectors.dense(v.toArray.zip(means.toArray).filter{ case (x, m) => m != 0.0 }.map(_._1))) + val sds = self.colSDs() + self.take(1).head.toBreeze.isInstanceOf[BDV[Double]] match { + case true => + self.map{ v => + Vectors.dense(v.toArray.zip(sds.toArray).filter{case (x, m) => m != 0.0}.map(_._1)) + } + case false => + self.map { v => + val filtered = v.toArray.zip(sds.toArray).filter{case (x, m) => m != 0.0}.map(_._1) + val denseVector = Vectors.dense(filtered).toBreeze + val size = denseVector.size + val iterElement = denseVector.activeIterator.toSeq + Vectors.sparse(size, iterElement) + } + } } } From ad6c82d1c69d38fee104bbb683624c3fdf31f8a4 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Sat, 29 Mar 2014 11:08:33 +0800 Subject: [PATCH 06/38] add shrink test --- .../mllib/rdd/VectorRDDFunctionsSuite.scala | 49 +++++++++++++++---- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index 20a8f3c94ee69..e20d52d0b440d 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -17,21 +17,19 @@ package org.apache.spark.mllib.rdd -import org.apache.spark.mllib.linalg.Vector import org.scalatest.FunSuite - -import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.util.MLUtils._ -import VectorRDDFunctionsSuite._ +import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.LocalSparkContext +import org.apache.spark.mllib.util.MLUtils._ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { + import VectorRDDFunctionsSuite._ val localData = Array( - Vectors.dense(1.0, 2.0, 3.0), - Vectors.dense(4.0, 5.0, 6.0), - Vectors.dense(7.0, 8.0, 9.0) - ) + Vectors.dense(1.0, 2.0, 3.0), + Vectors.dense(4.0, 5.0, 6.0), + Vectors.dense(7.0, 8.0, 9.0) + ) val rowMeans = Array(2.0, 5.0, 8.0) val rowNorm2 = Array(math.sqrt(14.0), math.sqrt(77.0), math.sqrt(194.0)) @@ -44,6 +42,23 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { val maxVec = Array(7.0, 8.0, 9.0) val minVec = Array(1.0, 2.0, 3.0) + val shrinkingData = Array( + Vectors.dense(1.0, 2.0, 0.0), + Vectors.dense(0.0, 0.0, 0.0), + Vectors.dense(7.0, 8.0, 0.0) + ) + + val rowShrinkData = Array( + Vectors.dense(1.0, 2.0, 0.0), + Vectors.dense(7.0, 8.0, 0.0) + ) + + val colShrinkData = Array( + Vectors.dense(1.0, 2.0), + Vectors.dense(0.0, 0.0), + Vectors.dense(7.0, 8.0) + ) + test("rowMeans") { val data = sc.parallelize(localData, 2) assert(equivVector(Vectors.dense(data.rowMeans().collect()), Vectors.dense(rowMeans)), "Row means do not match.") @@ -91,6 +106,22 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { "Optional minimum does not match." 
) } + + test("rowShrink") { + val data = sc.parallelize(shrinkingData, 2) + val res = data.rowShrink().collect() + rowShrinkData.zip(res).foreach { case (lhs, rhs) => + assert(equivVector(lhs, rhs), "Row shrink error.") + } + } + + test("columnShrink") { + val data = sc.parallelize(shrinkingData, 2) + val res = data.colShrink().collect() + colShrinkData.zip(res).foreach { case (lhs, rhs) => + assert(equivVector(lhs, rhs), "Column shrink error.") + } + } } object VectorRDDFunctionsSuite { From 9af2e95b52e6e7f676c2fb971a3971b79e3f615c Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Sat, 29 Mar 2014 11:40:03 +0800 Subject: [PATCH 07/38] refine the code style --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 46 +++++++++++-------- .../mllib/rdd/VectorRDDFunctionsSuite.scala | 18 +++++--- 2 files changed, 39 insertions(+), 25 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 9ec7712142b1f..1f53a60bc3171 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -23,8 +23,9 @@ import org.apache.spark.mllib.util.MLUtils._ import org.apache.spark.rdd.RDD /** - * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an implicit conversion. - * Import `org.apache.spark.MLContext._` at the top of your program to use these functions. + * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an + * implicit conversion. Import `org.apache.spark.MLContext._` at the top of your program to use + * these functions. */ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { @@ -81,10 +82,12 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { /** * Compute the norm-2 of each column in the RDD with `size` as the dimension of each `Vector`. */ - def colNorm2(size: Int): Vector = Vectors.fromBreeze(self.map(_.toBreeze).aggregate(BV.zeros[Double](size))( - seqOp = (c, v) => c + (v :* v), - combOp = (lhs, rhs) => lhs + rhs - ).map(math.sqrt)) + def colNorm2(size: Int): Vector = Vectors.fromBreeze(self.map(_.toBreeze) + .aggregate(BV.zeros[Double](size))( + seqOp = (c, v) => c + (v :* v), + combOp = (lhs, rhs) => lhs + rhs + ).map(math.sqrt) + ) /** * Compute the standard deviation of each column in the RDD. @@ -92,20 +95,23 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { def colSDs(): Vector = colSDs(self.take(1).head.size) /** - * Compute the standard deviation of each column in the RDD with `size` as the dimension of each `Vector`. + * Compute the standard deviation of each column in the RDD with `size` as the dimension of each + * `Vector`. 
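+   * Note that this is the population standard deviation: the averaged squared
+   * deviation is divided by the count N rather than N - 1.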
*/ def colSDs(size: Int): Vector = { val means = self.colMeans() - Vectors.fromBreeze(self.map(x => x.toBreeze - means.toBreeze).aggregate((BV.zeros[Double](size), 0.0))( - seqOp = (c, v) => (c, v) match { - case ((prev, cnt), current) => - (((prev :* cnt) + (current :* current)) :/ (cnt + 1.0), cnt + 1.0) - }, - combOp = (lhs, rhs) => (lhs, rhs) match { - case ((lhsVec, lhsCnt), (rhsVec, rhsCnt)) => - ((lhsVec :* lhsCnt) + (rhsVec :* rhsCnt) :/ (lhsCnt + rhsCnt), lhsCnt + rhsCnt) - } - )._1.map(math.sqrt)) + Vectors.fromBreeze(self.map(x => x.toBreeze - means.toBreeze) + .aggregate((BV.zeros[Double](size), 0.0))( + seqOp = (c, v) => (c, v) match { + case ((prev, cnt), current) => + (((prev :* cnt) + (current :* current)) :/ (cnt + 1.0), cnt + 1.0) + }, + combOp = (lhs, rhs) => (lhs, rhs) match { + case ((lhsVec, lhsCnt), (rhsVec, rhsCnt)) => + ((lhsVec :* lhsCnt) + (rhsVec :* rhsCnt) :/ (lhsCnt + rhsCnt), lhsCnt + rhsCnt) + } + )._1.map(math.sqrt) + ) } /** @@ -119,12 +125,14 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { } /** - * Find the optional max vector in the RDD, `None` will be returned if there is no elements at all. + * Find the optional max vector in the RDD, `None` will be returned if there is no elements at + * all. */ def maxOption(cmp: (Vector, Vector) => Boolean) = maxMinOption(cmp) /** - * Find the optional min vector in the RDD, `None` will be returned if there is no elements at all. + * Find the optional min vector in the RDD, `None` will be returned if there is no elements at + * all. */ def minOption(cmp: (Vector, Vector) => Boolean) = maxMinOption(!cmp(_, _)) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index e20d52d0b440d..f4ff560148ede 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -61,32 +61,38 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { test("rowMeans") { val data = sc.parallelize(localData, 2) - assert(equivVector(Vectors.dense(data.rowMeans().collect()), Vectors.dense(rowMeans)), "Row means do not match.") + assert(equivVector(Vectors.dense(data.rowMeans().collect()), Vectors.dense(rowMeans)), + "Row means do not match.") } test("rowNorm2") { val data = sc.parallelize(localData, 2) - assert(equivVector(Vectors.dense(data.rowNorm2().collect()), Vectors.dense(rowNorm2)), "Row norm2s do not match.") + assert(equivVector(Vectors.dense(data.rowNorm2().collect()), Vectors.dense(rowNorm2)), + "Row norm2s do not match.") } test("rowSDs") { val data = sc.parallelize(localData, 2) - assert(equivVector(Vectors.dense(data.rowSDs().collect()), Vectors.dense(rowSDs)), "Row SDs do not match.") + assert(equivVector(Vectors.dense(data.rowSDs().collect()), Vectors.dense(rowSDs)), + "Row SDs do not match.") } test("colMeans") { val data = sc.parallelize(localData, 2) - assert(equivVector(data.colMeans(), Vectors.dense(colMeans)), "Column means do not match.") + assert(equivVector(data.colMeans(), Vectors.dense(colMeans)), + "Column means do not match.") } test("colNorm2") { val data = sc.parallelize(localData, 2) - assert(equivVector(data.colNorm2(), Vectors.dense(colNorm2)), "Column norm2s do not match.") + assert(equivVector(data.colNorm2(), Vectors.dense(colNorm2)), + "Column norm2s do not match.") } test("colSDs") { val data = sc.parallelize(localData, 2) - 
assert(equivVector(data.colSDs(), Vectors.dense(colSDs)), "Column SDs do not match.") + assert(equivVector(data.colSDs(), Vectors.dense(colSDs)), + "Column SDs do not match.") } test("maxOption") { From cc658100c161fee84fff48874715fd542c518db4 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Sun, 30 Mar 2014 15:13:28 +0800 Subject: [PATCH 08/38] add parallel mean and variance --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 21 +++++++++++++++++++ .../mllib/rdd/VectorRDDFunctionsSuite.scala | 8 +++++++ 2 files changed, 29 insertions(+) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 1f53a60bc3171..1e941b2429914 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -21,6 +21,7 @@ import breeze.linalg.{Vector => BV, DenseVector => BDV} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLUtils._ import org.apache.spark.rdd.RDD +import breeze.numerics._ /** * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an @@ -161,4 +162,24 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { } } } + + def parallelMeanAndVar(size: Int): (Vector, Vector) = { + val statistics = self.map(_.toBreeze).aggregate((BV.zeros[Double](size), BV.zeros[Double](size), 0.0))( + seqOp = (c, v) => (c, v) match { + case ((prevMean, prevM2n, cnt), currData) => + val currMean = ((prevMean :* cnt) + currData) :/ (cnt + 1.0) + (currMean, prevM2n + ((currData - prevMean) :* (currData - currMean)), cnt + 1.0) + }, + combOp = (lhs, rhs) => (lhs, rhs) match { + case ((lhsMean, lhsM2n, lhsCnt), (rhsMean, rhsM2n, rhsCnt)) => + val totalCnt = lhsCnt + rhsCnt + val totalMean = (lhsMean :* lhsCnt) + (rhsMean :* rhsCnt) :/ totalCnt + val deltaMean = rhsMean - lhsMean + val totalM2n = lhsM2n + rhsM2n + (((deltaMean :* deltaMean) :* (lhsCnt * rhsCnt)) :/ totalCnt) + (totalMean, totalM2n, totalCnt) + } + ) + + (Vectors.fromBreeze(statistics._1), Vectors.fromBreeze(statistics._2 :/ statistics._3)) + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index f4ff560148ede..1fab692a12533 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -38,6 +38,7 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { val colMeans = Array(4.0, 5.0, 6.0) val colNorm2 = Array(math.sqrt(66.0), math.sqrt(93.0), math.sqrt(126.0)) val colSDs = Array(math.sqrt(6.0), math.sqrt(6.0), math.sqrt(6.0)) + val colVar = Array(6.0, 6.0, 6.0) val maxVec = Array(7.0, 8.0, 9.0) val minVec = Array(1.0, 2.0, 3.0) @@ -128,6 +129,13 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { assert(equivVector(lhs, rhs), "Column shrink error.") } } + + test("meanAndVar") { + val data = sc.parallelize(localData, 2) + val (mean, sd) = data.parallelMeanAndVar(3) + assert(equivVector(mean, Vectors.dense(colMeans)), "Column means do not match.") + assert(equivVector(sd, Vectors.dense(colVar)), "Column SD do not match.") + } } object VectorRDDFunctionsSuite { From 1338ea169e9c70a52100d648234d816e859deb7b Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Sun, 30 Mar 2014 16:03:49 +0800 
Subject: [PATCH 09/38] all-in-one version test passed --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 27 +++++++++++++------ .../mllib/rdd/VectorRDDFunctionsSuite.scala | 6 ++++- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 1e941b2429914..20af4467d757e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -21,7 +21,7 @@ import breeze.linalg.{Vector => BV, DenseVector => BDV} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLUtils._ import org.apache.spark.rdd.RDD -import breeze.numerics._ +import breeze.linalg._ /** * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an @@ -163,23 +163,34 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { } } - def parallelMeanAndVar(size: Int): (Vector, Vector) = { - val statistics = self.map(_.toBreeze).aggregate((BV.zeros[Double](size), BV.zeros[Double](size), 0.0))( + def parallelMeanAndVar(size: Int): (Vector, Vector, Double, Vector, Vector, Vector) = { + val statistics = self.map(_.toBreeze).aggregate((BV.zeros[Double](size), BV.zeros[Double](size), 0.0, BV.zeros[Double](size), BV.fill(size){Double.MinValue}, BV.fill(size){Double.MaxValue}))( seqOp = (c, v) => (c, v) match { - case ((prevMean, prevM2n, cnt), currData) => + case ((prevMean, prevM2n, cnt, nnz, maxVec, minVec), currData) => val currMean = ((prevMean :* cnt) + currData) :/ (cnt + 1.0) - (currMean, prevM2n + ((currData - prevMean) :* (currData - currMean)), cnt + 1.0) + val nonZeroCnt = Vectors.sparse(size, currData.activeKeysIterator.toSeq.map(x => (x, 1.0))).toBreeze + currData.activeIterator.foreach { case (id, value) => + if (maxVec(id) < value) maxVec(id) = value + if (minVec(id) > value) minVec(id) = value + } + (currMean, prevM2n + ((currData - prevMean) :* (currData - currMean)), cnt + 1.0, nnz + nonZeroCnt, maxVec, minVec) }, combOp = (lhs, rhs) => (lhs, rhs) match { - case ((lhsMean, lhsM2n, lhsCnt), (rhsMean, rhsM2n, rhsCnt)) => + case ((lhsMean, lhsM2n, lhsCnt, lhsNNZ, lhsMax, lhsMin), (rhsMean, rhsM2n, rhsCnt, rhsNNZ, rhsMax, rhsMin)) => val totalCnt = lhsCnt + rhsCnt val totalMean = (lhsMean :* lhsCnt) + (rhsMean :* rhsCnt) :/ totalCnt val deltaMean = rhsMean - lhsMean val totalM2n = lhsM2n + rhsM2n + (((deltaMean :* deltaMean) :* (lhsCnt * rhsCnt)) :/ totalCnt) - (totalMean, totalM2n, totalCnt) + rhsMax.activeIterator.foreach { case (id, value) => + if (lhsMax(id) < value) lhsMax(id) = value + } + rhsMin.activeIterator.foreach { case (id, value) => + if (lhsMin(id) > value) lhsMin(id) = value + } + (totalMean, totalM2n, totalCnt, lhsNNZ + rhsNNZ, lhsMax, lhsMin) } ) - (Vectors.fromBreeze(statistics._1), Vectors.fromBreeze(statistics._2 :/ statistics._3)) + (Vectors.fromBreeze(statistics._1), Vectors.fromBreeze(statistics._2 :/ statistics._3), statistics._3, Vectors.fromBreeze(statistics._4), Vectors.fromBreeze(statistics._5), Vectors.fromBreeze(statistics._6)) } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index 1fab692a12533..0e8a810fa86be 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ 
b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -132,9 +132,13 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { test("meanAndVar") { val data = sc.parallelize(localData, 2) - val (mean, sd) = data.parallelMeanAndVar(3) + val (mean, sd, cnt, nnz, max, min) = data.parallelMeanAndVar(3) assert(equivVector(mean, Vectors.dense(colMeans)), "Column means do not match.") assert(equivVector(sd, Vectors.dense(colVar)), "Column SD do not match.") + assert(cnt === 3, "Column cnt do not match.") + assert(equivVector(nnz, Vectors.dense(3.0, 3.0, 3.0)), "Column nnz do not match.") + assert(equivVector(max, Vectors.dense(7.0, 8.0, 9.0)), "Column max do not match.") + assert(equivVector(min, Vectors.dense(1.0, 2.0, 3.0)), "Column min do not match.") } } From c4651bbf3a45bd85223dd924eae8978f7ee2617a Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Sun, 30 Mar 2014 16:22:41 +0800 Subject: [PATCH 10/38] remove row-wise APIs and refine code --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 98 ++++++++++--------- .../mllib/rdd/VectorRDDFunctionsSuite.scala | 39 +------- 2 files changed, 52 insertions(+), 85 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 20af4467d757e..70d982b4a68f6 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -21,7 +21,6 @@ import breeze.linalg.{Vector => BV, DenseVector => BDV} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLUtils._ import org.apache.spark.rdd.RDD -import breeze.linalg._ /** * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an @@ -30,30 +29,6 @@ import breeze.linalg._ */ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { - /** - * Compute the mean of each `Vector` in the RDD. - */ - def rowMeans(): RDD[Double] = { - self.map(x => x.toArray.sum / x.size) - } - - /** - * Compute the norm-2 of each `Vector` in the RDD. - */ - def rowNorm2(): RDD[Double] = { - self.map(x => math.sqrt(x.toArray.map(x => x*x).sum)) - } - - /** - * Compute the standard deviation of each `Vector` in the RDD. - */ - def rowSDs(): RDD[Double] = { - val means = self.rowMeans() - self.zip(means) - .map{ case(x, m) => x.toBreeze - m } - .map{ x => math.sqrt(x.toArray.map(x => x*x).sum / x.size) } - } - /** * Compute the mean of each column in the RDD. */ @@ -137,11 +112,6 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { */ def minOption(cmp: (Vector, Vector) => Boolean) = maxMinOption(!cmp(_, _)) - /** - * Filter the vectors whose standard deviation is not zero. - */ - def rowShrink(): RDD[Vector] = self.zip(self.rowSDs()).filter(_._2 != 0.0).map(_._1) - /** * Filter each column of the RDD whose standard deviation is not zero. 
*/ @@ -163,34 +133,66 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { } } - def parallelMeanAndVar(size: Int): (Vector, Vector, Double, Vector, Vector, Vector) = { - val statistics = self.map(_.toBreeze).aggregate((BV.zeros[Double](size), BV.zeros[Double](size), 0.0, BV.zeros[Double](size), BV.fill(size){Double.MinValue}, BV.fill(size){Double.MaxValue}))( + /** + * Compute full column-wise statistics for the RDD, including + * {{{ + * Mean: Vector, + * Variance: Vector, + * Count: Double, + * Non-zero count: Vector, + * Maximum elements: Vector, + * Minimum elements: Vector. + * }}}, + * with the size of Vector as input parameter. + */ + def statistics(size: Int): (Vector, Vector, Double, Vector, Vector, Vector) = { + val results = self.map(_.toBreeze).aggregate(( + BV.zeros[Double](size), + BV.zeros[Double](size), + 0.0, + BV.zeros[Double](size), + BV.fill(size){Double.MinValue}, + BV.fill(size){Double.MaxValue}))( seqOp = (c, v) => (c, v) match { - case ((prevMean, prevM2n, cnt, nnz, maxVec, minVec), currData) => + case ((prevMean, prevM2n, cnt, nnzVec, maxVec, minVec), currData) => val currMean = ((prevMean :* cnt) + currData) :/ (cnt + 1.0) - val nonZeroCnt = Vectors.sparse(size, currData.activeKeysIterator.toSeq.map(x => (x, 1.0))).toBreeze + val nonZeroCnt = Vectors + .sparse(size, currData.activeKeysIterator.toSeq.map(x => (x, 1.0))).toBreeze currData.activeIterator.foreach { case (id, value) => if (maxVec(id) < value) maxVec(id) = value if (minVec(id) > value) minVec(id) = value } - (currMean, prevM2n + ((currData - prevMean) :* (currData - currMean)), cnt + 1.0, nnz + nonZeroCnt, maxVec, minVec) + (currMean, + prevM2n + ((currData - prevMean) :* (currData - currMean)), + cnt + 1.0, + nnzVec + nonZeroCnt, + maxVec, + minVec) }, combOp = (lhs, rhs) => (lhs, rhs) match { - case ((lhsMean, lhsM2n, lhsCnt, lhsNNZ, lhsMax, lhsMin), (rhsMean, rhsM2n, rhsCnt, rhsNNZ, rhsMax, rhsMin)) => - val totalCnt = lhsCnt + rhsCnt - val totalMean = (lhsMean :* lhsCnt) + (rhsMean :* rhsCnt) :/ totalCnt - val deltaMean = rhsMean - lhsMean - val totalM2n = lhsM2n + rhsM2n + (((deltaMean :* deltaMean) :* (lhsCnt * rhsCnt)) :/ totalCnt) - rhsMax.activeIterator.foreach { case (id, value) => - if (lhsMax(id) < value) lhsMax(id) = value - } - rhsMin.activeIterator.foreach { case (id, value) => - if (lhsMin(id) > value) lhsMin(id) = value - } - (totalMean, totalM2n, totalCnt, lhsNNZ + rhsNNZ, lhsMax, lhsMin) + case ( + (lhsMean, lhsM2n, lhsCnt, lhsNNZ, lhsMax, lhsMin), + (rhsMean, rhsM2n, rhsCnt, rhsNNZ, rhsMax, rhsMin)) => + val totalCnt = lhsCnt + rhsCnt + val totalMean = (lhsMean :* lhsCnt) + (rhsMean :* rhsCnt) :/ totalCnt + val deltaMean = rhsMean - lhsMean + val totalM2n = + lhsM2n + rhsM2n + (((deltaMean :* deltaMean) :* (lhsCnt * rhsCnt)) :/ totalCnt) + rhsMax.activeIterator.foreach { case (id, value) => + if (lhsMax(id) < value) lhsMax(id) = value + } + rhsMin.activeIterator.foreach { case (id, value) => + if (lhsMin(id) > value) lhsMin(id) = value + } + (totalMean, totalM2n, totalCnt, lhsNNZ + rhsNNZ, lhsMax, lhsMin) } ) - (Vectors.fromBreeze(statistics._1), Vectors.fromBreeze(statistics._2 :/ statistics._3), statistics._3, Vectors.fromBreeze(statistics._4), Vectors.fromBreeze(statistics._5), Vectors.fromBreeze(statistics._6)) + (Vectors.fromBreeze(results._1), + Vectors.fromBreeze(results._2 :/ results._3), + results._3, + Vectors.fromBreeze(results._4), + Vectors.fromBreeze(results._5), + Vectors.fromBreeze(results._6)) } } diff --git 
a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index 0e8a810fa86be..72a2f062b875c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -31,10 +31,6 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { Vectors.dense(7.0, 8.0, 9.0) ) - val rowMeans = Array(2.0, 5.0, 8.0) - val rowNorm2 = Array(math.sqrt(14.0), math.sqrt(77.0), math.sqrt(194.0)) - val rowSDs = Array(math.sqrt(2.0 / 3.0), math.sqrt(2.0 / 3.0), math.sqrt(2.0 / 3.0)) - val colMeans = Array(4.0, 5.0, 6.0) val colNorm2 = Array(math.sqrt(66.0), math.sqrt(93.0), math.sqrt(126.0)) val colSDs = Array(math.sqrt(6.0), math.sqrt(6.0), math.sqrt(6.0)) @@ -49,35 +45,12 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { Vectors.dense(7.0, 8.0, 0.0) ) - val rowShrinkData = Array( - Vectors.dense(1.0, 2.0, 0.0), - Vectors.dense(7.0, 8.0, 0.0) - ) - val colShrinkData = Array( Vectors.dense(1.0, 2.0), Vectors.dense(0.0, 0.0), Vectors.dense(7.0, 8.0) ) - test("rowMeans") { - val data = sc.parallelize(localData, 2) - assert(equivVector(Vectors.dense(data.rowMeans().collect()), Vectors.dense(rowMeans)), - "Row means do not match.") - } - - test("rowNorm2") { - val data = sc.parallelize(localData, 2) - assert(equivVector(Vectors.dense(data.rowNorm2().collect()), Vectors.dense(rowNorm2)), - "Row norm2s do not match.") - } - - test("rowSDs") { - val data = sc.parallelize(localData, 2) - assert(equivVector(Vectors.dense(data.rowSDs().collect()), Vectors.dense(rowSDs)), - "Row SDs do not match.") - } - test("colMeans") { val data = sc.parallelize(localData, 2) assert(equivVector(data.colMeans(), Vectors.dense(colMeans)), @@ -114,14 +87,6 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { ) } - test("rowShrink") { - val data = sc.parallelize(shrinkingData, 2) - val res = data.rowShrink().collect() - rowShrinkData.zip(res).foreach { case (lhs, rhs) => - assert(equivVector(lhs, rhs), "Row shrink error.") - } - } - test("columnShrink") { val data = sc.parallelize(shrinkingData, 2) val res = data.colShrink().collect() @@ -130,9 +95,9 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { } } - test("meanAndVar") { + test("full-statistics") { val data = sc.parallelize(localData, 2) - val (mean, sd, cnt, nnz, max, min) = data.parallelMeanAndVar(3) + val (mean, sd, cnt, nnz, max, min) = data.statistics(3) assert(equivVector(mean, Vectors.dense(colMeans)), "Column means do not match.") assert(equivVector(sd, Vectors.dense(colVar)), "Column SD do not match.") assert(cnt === 3, "Column cnt do not match.") From d816ac7aa2501678dfd4f1a281f2321bc3e0c9c2 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Sun, 30 Mar 2014 16:42:33 +0800 Subject: [PATCH 11/38] remove useless APIs --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 107 +----------------- .../mllib/rdd/VectorRDDFunctionsSuite.scala | 78 +------------ 2 files changed, 4 insertions(+), 181 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 70d982b4a68f6..34f8e31b8600a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -16,10 +16,9 @@ */ package 
org.apache.spark.mllib.rdd -import breeze.linalg.{Vector => BV, DenseVector => BDV} +import breeze.linalg.{Vector => BV} import org.apache.spark.mllib.linalg.{Vector, Vectors} -import org.apache.spark.mllib.util.MLUtils._ import org.apache.spark.rdd.RDD /** @@ -29,110 +28,6 @@ import org.apache.spark.rdd.RDD */ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { - /** - * Compute the mean of each column in the RDD. - */ - def colMeans(): Vector = colMeans(self.take(1).head.size) - - /** - * Compute the mean of each column in the RDD with `size` as the dimension of each `Vector`. - */ - def colMeans(size: Int): Vector = { - Vectors.fromBreeze(self.map(_.toBreeze).aggregate((BV.zeros[Double](size), 0.0))( - seqOp = (c, v) => (c, v) match { - case ((prev, cnt), current) => - (((prev :* cnt) + current) :/ (cnt + 1.0), cnt + 1.0) - }, - combOp = (lhs, rhs) => (lhs, rhs) match { - case ((lhsVec, lhsCnt), (rhsVec, rhsCnt)) => - ((lhsVec :* lhsCnt) + (rhsVec :* rhsCnt) :/ (lhsCnt + rhsCnt), lhsCnt + rhsCnt) - } - )._1) - } - - /** - * Compute the norm-2 of each column in the RDD. - */ - def colNorm2(): Vector = colNorm2(self.take(1).head.size) - - /** - * Compute the norm-2 of each column in the RDD with `size` as the dimension of each `Vector`. - */ - def colNorm2(size: Int): Vector = Vectors.fromBreeze(self.map(_.toBreeze) - .aggregate(BV.zeros[Double](size))( - seqOp = (c, v) => c + (v :* v), - combOp = (lhs, rhs) => lhs + rhs - ).map(math.sqrt) - ) - - /** - * Compute the standard deviation of each column in the RDD. - */ - def colSDs(): Vector = colSDs(self.take(1).head.size) - - /** - * Compute the standard deviation of each column in the RDD with `size` as the dimension of each - * `Vector`. - */ - def colSDs(size: Int): Vector = { - val means = self.colMeans() - Vectors.fromBreeze(self.map(x => x.toBreeze - means.toBreeze) - .aggregate((BV.zeros[Double](size), 0.0))( - seqOp = (c, v) => (c, v) match { - case ((prev, cnt), current) => - (((prev :* cnt) + (current :* current)) :/ (cnt + 1.0), cnt + 1.0) - }, - combOp = (lhs, rhs) => (lhs, rhs) match { - case ((lhsVec, lhsCnt), (rhsVec, rhsCnt)) => - ((lhsVec :* lhsCnt) + (rhsVec :* rhsCnt) :/ (lhsCnt + rhsCnt), lhsCnt + rhsCnt) - } - )._1.map(math.sqrt) - ) - } - - /** - * Find the optional max or min vector in the RDD. - */ - private def maxMinOption(cmp: (Vector, Vector) => Boolean): Option[Vector] = { - def cmpMaxMin(x1: Vector, x2: Vector) = if (cmp(x1, x2)) x1 else x2 - self.mapPartitions { iterator => - Seq(iterator.reduceOption(cmpMaxMin)).iterator - }.collect { case Some(x) => x }.collect().reduceOption(cmpMaxMin) - } - - /** - * Find the optional max vector in the RDD, `None` will be returned if there is no elements at - * all. - */ - def maxOption(cmp: (Vector, Vector) => Boolean) = maxMinOption(cmp) - - /** - * Find the optional min vector in the RDD, `None` will be returned if there is no elements at - * all. - */ - def minOption(cmp: (Vector, Vector) => Boolean) = maxMinOption(!cmp(_, _)) - - /** - * Filter each column of the RDD whose standard deviation is not zero. 
- */ - def colShrink(): RDD[Vector] = { - val sds = self.colSDs() - self.take(1).head.toBreeze.isInstanceOf[BDV[Double]] match { - case true => - self.map{ v => - Vectors.dense(v.toArray.zip(sds.toArray).filter{case (x, m) => m != 0.0}.map(_._1)) - } - case false => - self.map { v => - val filtered = v.toArray.zip(sds.toArray).filter{case (x, m) => m != 0.0}.map(_._1) - val denseVector = Vectors.dense(filtered).toBreeze - val size = denseVector.size - val iterElement = denseVector.activeIterator.toSeq - Vectors.sparse(size, iterElement) - } - } - } - /** * Compute full column-wise statistics for the RDD, including * {{{ diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index 72a2f062b875c..8a2ba192ce734 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -31,75 +31,11 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { Vectors.dense(7.0, 8.0, 9.0) ) - val colMeans = Array(4.0, 5.0, 6.0) - val colNorm2 = Array(math.sqrt(66.0), math.sqrt(93.0), math.sqrt(126.0)) - val colSDs = Array(math.sqrt(6.0), math.sqrt(6.0), math.sqrt(6.0)) - val colVar = Array(6.0, 6.0, 6.0) - - val maxVec = Array(7.0, 8.0, 9.0) - val minVec = Array(1.0, 2.0, 3.0) - - val shrinkingData = Array( - Vectors.dense(1.0, 2.0, 0.0), - Vectors.dense(0.0, 0.0, 0.0), - Vectors.dense(7.0, 8.0, 0.0) - ) - - val colShrinkData = Array( - Vectors.dense(1.0, 2.0), - Vectors.dense(0.0, 0.0), - Vectors.dense(7.0, 8.0) - ) - - test("colMeans") { - val data = sc.parallelize(localData, 2) - assert(equivVector(data.colMeans(), Vectors.dense(colMeans)), - "Column means do not match.") - } - - test("colNorm2") { - val data = sc.parallelize(localData, 2) - assert(equivVector(data.colNorm2(), Vectors.dense(colNorm2)), - "Column norm2s do not match.") - } - - test("colSDs") { - val data = sc.parallelize(localData, 2) - assert(equivVector(data.colSDs(), Vectors.dense(colSDs)), - "Column SDs do not match.") - } - - test("maxOption") { - val data = sc.parallelize(localData, 2) - assert(equivVectorOption( - data.maxOption((lhs: Vector, rhs: Vector) => lhs.toBreeze.norm(2) >= rhs.toBreeze.norm(2)), - Some(Vectors.dense(maxVec))), - "Optional maximum does not match." - ) - } - - test("minOption") { - val data = sc.parallelize(localData, 2) - assert(equivVectorOption( - data.minOption((lhs: Vector, rhs: Vector) => lhs.toBreeze.norm(2) >= rhs.toBreeze.norm(2)), - Some(Vectors.dense(minVec))), - "Optional minimum does not match." 
-    )
-  }
-
-  test("columnShrink") {
-    val data = sc.parallelize(shrinkingData, 2)
-    val res = data.colShrink().collect()
-    colShrinkData.zip(res).foreach { case (lhs, rhs) =>
-      assert(equivVector(lhs, rhs), "Column shrink error.")
-    }
-  }
-
   test("full-statistics") {
     val data = sc.parallelize(localData, 2)
-    val (mean, sd, cnt, nnz, max, min) = data.statistics(3)
-    assert(equivVector(mean, Vectors.dense(colMeans)), "Column means do not match.")
-    assert(equivVector(sd, Vectors.dense(colVar)), "Column SD do not match.")
+    val (mean, variance, cnt, nnz, max, min) = data.statistics(3)
+    assert(equivVector(mean, Vectors.dense(4.0, 5.0, 6.0)), "Column mean do not match.")
+    assert(equivVector(variance, Vectors.dense(6.0, 6.0, 6.0)), "Column variance do not match.")
     assert(cnt === 3, "Column cnt do not match.")
     assert(equivVector(nnz, Vectors.dense(3.0, 3.0, 3.0)), "Column nnz do not match.")
     assert(equivVector(max, Vectors.dense(7.0, 8.0, 9.0)), "Column max do not match.")
     assert(equivVector(min, Vectors.dense(1.0, 2.0, 3.0)), "Column min do not match.")
   }
 }
 
 object VectorRDDFunctionsSuite {
   def equivVector(lhs: Vector, rhs: Vector): Boolean = {
     (lhs.toBreeze - rhs.toBreeze).norm(2) < 1e-9
   }
-
-  def equivVectorOption(lhs: Option[Vector], rhs: Option[Vector]): Boolean = {
-    (lhs, rhs) match {
-      case (Some(a), Some(b)) => (a.toBreeze - b.toBreeze).norm(2) < 1e-9
-      case (None, None) => true
-      case _ => false
-    }
-  }
 }

From 9a75ebdde74c9d64a04a9bc1219fe1b9b305e0eb Mon Sep 17 00:00:00 2001
From: Xusen Yin
Date: Tue, 1 Apr 2014 13:36:57 +0800
Subject: [PATCH 12/38] add case class to wrap return values

---
 .../spark/mllib/rdd/VectorRDDFunctions.scala      | 15 ++++++++++++---
 .../spark/mllib/rdd/VectorRDDFunctionsSuite.scala |  2 +-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala
index 34f8e31b8600a..5499d1d9ea5e6 100644
--- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala
+++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala
@@ -21,6 +21,14 @@ import breeze.linalg.{Vector => BV}
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.rdd.RDD
 
+case class VectorRDDStatisticalSummary(
+    mean: Vector,
+    variance: Vector,
+    count: Long,
+    nonZeroCnt: Vector,
+    max: Vector,
+    min: Vector) extends Serializable
+
 /**
  * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an
  * implicit conversion. Import `org.apache.spark.MLContext._` at the top of your program to use
@@ -40,7 +48,7 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable {
 * }}},
 * with the size of Vector as input parameter.
*/ - def statistics(size: Int): (Vector, Vector, Double, Vector, Vector, Vector) = { + def summarizeStatistics(size: Int): VectorRDDStatisticalSummary = { val results = self.map(_.toBreeze).aggregate(( BV.zeros[Double](size), BV.zeros[Double](size), @@ -83,9 +91,10 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { } ) - (Vectors.fromBreeze(results._1), + VectorRDDStatisticalSummary( + Vectors.fromBreeze(results._1), Vectors.fromBreeze(results._2 :/ results._3), - results._3, + results._3.toLong, Vectors.fromBreeze(results._4), Vectors.fromBreeze(results._5), Vectors.fromBreeze(results._6)) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index 8a2ba192ce734..087bb8a6ba4f1 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -33,7 +33,7 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { test("full-statistics") { val data = sc.parallelize(localData, 2) - val (mean, variance, cnt, nnz, max, min) = data.statistics(3) + val VectorRDDStatisticalSummary(mean, variance, cnt, nnz, max, min) = data.summarizeStatistics(3) assert(equivVector(mean, Vectors.dense(4.0, 5.0, 6.0)), "Column mean do not match.") assert(equivVector(variance, Vectors.dense(6.0, 6.0, 6.0)), "Column variance do not match.") assert(cnt === 3, "Column cnt do not match.") From 62a2c3ef5244e288960fb4198be07bb0c3f53304 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 1 Apr 2014 13:59:22 +0800 Subject: [PATCH 13/38] use axpy and in-place if possible --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 5499d1d9ea5e6..7e369a7e58077 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -20,6 +20,7 @@ import breeze.linalg.{Vector => BV} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD +import breeze.linalg.axpy case class VectorRDDStatisticalSummary( mean: Vector, @@ -58,17 +59,22 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { BV.fill(size){Double.MaxValue}))( seqOp = (c, v) => (c, v) match { case ((prevMean, prevM2n, cnt, nnzVec, maxVec, minVec), currData) => - val currMean = ((prevMean :* cnt) + currData) :/ (cnt + 1.0) - val nonZeroCnt = Vectors - .sparse(size, currData.activeKeysIterator.toSeq.map(x => (x, 1.0))).toBreeze + val currMean = prevMean :* (cnt / (cnt + 1.0)) + axpy(1.0/(cnt+1.0), currData, currMean) + axpy(-1.0, currData, prevMean) + prevMean :*= (currMean - currData) + axpy(1.0, prevMean, prevM2n) + axpy(1.0, + Vectors.sparse(size, currData.activeKeysIterator.toSeq.map(x => (x, 1.0))).toBreeze, + nnzVec) currData.activeIterator.foreach { case (id, value) => if (maxVec(id) < value) maxVec(id) = value if (minVec(id) > value) minVec(id) = value } (currMean, - prevM2n + ((currData - prevMean) :* (currData - currMean)), + prevM2n, cnt + 1.0, - nnzVec + nonZeroCnt, + nnzVec, maxVec, minVec) }, @@ -77,23 +83,30 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { (lhsMean, lhsM2n, lhsCnt, lhsNNZ, lhsMax, lhsMin), 
(rhsMean, rhsM2n, rhsCnt, rhsNNZ, rhsMax, rhsMin)) => val totalCnt = lhsCnt + rhsCnt - val totalMean = (lhsMean :* lhsCnt) + (rhsMean :* rhsCnt) :/ totalCnt val deltaMean = rhsMean - lhsMean - val totalM2n = - lhsM2n + rhsM2n + (((deltaMean :* deltaMean) :* (lhsCnt * rhsCnt)) :/ totalCnt) + lhsMean :*= (lhsCnt / totalCnt) + axpy(rhsCnt/totalCnt, rhsMean, lhsMean) + val totalMean = lhsMean + deltaMean :*= deltaMean + axpy(lhsCnt*rhsCnt/totalCnt, deltaMean, lhsM2n) + axpy(1.0, rhsM2n, lhsM2n) + val totalM2n = lhsM2n rhsMax.activeIterator.foreach { case (id, value) => if (lhsMax(id) < value) lhsMax(id) = value } rhsMin.activeIterator.foreach { case (id, value) => if (lhsMin(id) > value) lhsMin(id) = value } - (totalMean, totalM2n, totalCnt, lhsNNZ + rhsNNZ, lhsMax, lhsMin) + axpy(1.0, rhsNNZ, lhsNNZ) + (totalMean, totalM2n, totalCnt, lhsNNZ, lhsMax, lhsMin) } ) + results._2 :/= results._3 + VectorRDDStatisticalSummary( Vectors.fromBreeze(results._1), - Vectors.fromBreeze(results._2 :/ results._3), + Vectors.fromBreeze(results._2), results._3.toLong, Vectors.fromBreeze(results._4), Vectors.fromBreeze(results._5), From 3980287f8765d6c8bc14e98a97b4b8da07f7c8e8 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 1 Apr 2014 14:01:55 +0800 Subject: [PATCH 14/38] rename variables --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 7e369a7e58077..ff1d1853c6307 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -78,27 +78,27 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { maxVec, minVec) }, - combOp = (lhs, rhs) => (lhs, rhs) match { + combOp = (c, v) => (c, v) match { case ( - (lhsMean, lhsM2n, lhsCnt, lhsNNZ, lhsMax, lhsMin), - (rhsMean, rhsM2n, rhsCnt, rhsNNZ, rhsMax, rhsMin)) => - val totalCnt = lhsCnt + rhsCnt - val deltaMean = rhsMean - lhsMean - lhsMean :*= (lhsCnt / totalCnt) - axpy(rhsCnt/totalCnt, rhsMean, lhsMean) - val totalMean = lhsMean + (mean1, m2n1, cnt1, nnz1, max1, min1), + (mean2, m2n2, cnt2, nnz2, max2, min2)) => + val totalCnt = cnt1 + cnt2 + val deltaMean = mean2 - mean1 + mean1 :*= (cnt1 / totalCnt) + axpy(cnt2/totalCnt, mean2, mean1) + val totalMean = mean1 deltaMean :*= deltaMean - axpy(lhsCnt*rhsCnt/totalCnt, deltaMean, lhsM2n) - axpy(1.0, rhsM2n, lhsM2n) - val totalM2n = lhsM2n - rhsMax.activeIterator.foreach { case (id, value) => - if (lhsMax(id) < value) lhsMax(id) = value + axpy(cnt1*cnt2/totalCnt, deltaMean, m2n1) + axpy(1.0, m2n2, m2n1) + val totalM2n = m2n1 + max2.activeIterator.foreach { case (id, value) => + if (max1(id) < value) max1(id) = value } - rhsMin.activeIterator.foreach { case (id, value) => - if (lhsMin(id) > value) lhsMin(id) = value + min2.activeIterator.foreach { case (id, value) => + if (min1(id) > value) min1(id) = value } - axpy(1.0, rhsNNZ, lhsNNZ) - (totalMean, totalM2n, totalCnt, lhsNNZ, lhsMax, lhsMin) + axpy(1.0, nnz2, nnz1) + (totalMean, totalM2n, totalCnt, nnz1, max1, min1) } ) From a6d5a2e646abf549e69d92cb0551a0f5c13b8930 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 1 Apr 2014 15:13:05 +0800 Subject: [PATCH 15/38] rewrite for only computing non-zero elements --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 53 +++++++++---------- 1 file changed, 26 insertions(+), 
27 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index ff1d1853c6307..f62ee1fb9b0d8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -50,7 +50,7 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { * with the size of Vector as input parameter. */ def summarizeStatistics(size: Int): VectorRDDStatisticalSummary = { - val results = self.map(_.toBreeze).aggregate(( + val (fakeMean, fakeM2n, totalCnt, nnz, max, min) = self.map(_.toBreeze).aggregate(( BV.zeros[Double](size), BV.zeros[Double](size), 0.0, @@ -59,19 +59,16 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { BV.fill(size){Double.MaxValue}))( seqOp = (c, v) => (c, v) match { case ((prevMean, prevM2n, cnt, nnzVec, maxVec, minVec), currData) => - val currMean = prevMean :* (cnt / (cnt + 1.0)) - axpy(1.0/(cnt+1.0), currData, currMean) - axpy(-1.0, currData, prevMean) - prevMean :*= (currMean - currData) - axpy(1.0, prevMean, prevM2n) - axpy(1.0, - Vectors.sparse(size, currData.activeKeysIterator.toSeq.map(x => (x, 1.0))).toBreeze, - nnzVec) - currData.activeIterator.foreach { case (id, value) => + currData.activeIterator.map{ case (id, value) => + val tmpPrevMean = prevMean(id) + prevMean(id) = (prevMean(id) * cnt + value) / (cnt + 1.0) if (maxVec(id) < value) maxVec(id) = value if (minVec(id) > value) minVec(id) = value + nnzVec(id) += 1.0 + prevM2n(id) += (value - prevMean(id)) * (value - tmpPrevMean) } - (currMean, + + (prevMean, prevM2n, cnt + 1.0, nnzVec, @@ -84,32 +81,34 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { (mean2, m2n2, cnt2, nnz2, max2, min2)) => val totalCnt = cnt1 + cnt2 val deltaMean = mean2 - mean1 - mean1 :*= (cnt1 / totalCnt) - axpy(cnt2/totalCnt, mean2, mean1) - val totalMean = mean1 - deltaMean :*= deltaMean - axpy(cnt1*cnt2/totalCnt, deltaMean, m2n1) - axpy(1.0, m2n2, m2n1) - val totalM2n = m2n1 + val totalMean = ((mean1 :* nnz1) + (mean2 :* nnz2)) :/ (nnz1 + nnz2) + val totalM2n = m2n1 + m2n2 + ((deltaMean :* deltaMean) :* (nnz1 :* nnz2) :/ (nnz1 + nnz2)) max2.activeIterator.foreach { case (id, value) => if (max1(id) < value) max1(id) = value } min2.activeIterator.foreach { case (id, value) => if (min1(id) > value) min1(id) = value } - axpy(1.0, nnz2, nnz1) - (totalMean, totalM2n, totalCnt, nnz1, max1, min1) + (totalMean, totalM2n, totalCnt, nnz1 + nnz2, max1, min1) } ) - results._2 :/= results._3 + // solve real mean + val realMean = fakeMean :* nnz :/ totalCnt + // solve real variance + val deltaMean = fakeMean :- 0.0 + val realVar = fakeM2n - ((deltaMean :* deltaMean) :* (nnz :* (nnz :- totalCnt)) :/ totalCnt) + max :+= 0.0 + min :+= 0.0 + + realVar :/= totalCnt VectorRDDStatisticalSummary( - Vectors.fromBreeze(results._1), - Vectors.fromBreeze(results._2), - results._3.toLong, - Vectors.fromBreeze(results._4), - Vectors.fromBreeze(results._5), - Vectors.fromBreeze(results._6)) + Vectors.fromBreeze(realMean), + Vectors.fromBreeze(realVar), + totalCnt.toLong, + Vectors.fromBreeze(nnz), + Vectors.fromBreeze(max), + Vectors.fromBreeze(min)) } } From 4e4fbd12095884d8ec1d3a7dfa38efb9682710dc Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 1 Apr 2014 16:10:27 +0800 Subject: [PATCH 16/38] separate seqop and combop out as independent functions --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 102 
+++++++++++------- 1 file changed, 62 insertions(+), 40 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index f62ee1fb9b0d8..8ab93630a2463 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -30,6 +30,14 @@ case class VectorRDDStatisticalSummary( min: Vector, nonZeroCnt: Vector) extends Serializable +private case class VectorRDDStatisticalRing( + fakeMean: BV[Double], + fakeM2n: BV[Double], + totalCnt: Double, + nnz: BV[Double], + max: BV[Double], + min: BV[Double]) + /** * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an * implicit conversion. Import `org.apache.spark.MLContext._` at the top of your program to use @@ -49,57 +57,71 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { * }}}, * with the size of Vector as input parameter. */ + + private def seqOp(aggregator: VectorRDDStatisticalRing, currData: BV[Double]): VectorRDDStatisticalRing = { + aggregator match { + case VectorRDDStatisticalRing(prevMean, prevM2n, cnt, nnzVec, maxVec, minVec) => + currData.activeIterator.foreach { + case (id, value) => + if (maxVec(id) < value) maxVec(id) = value + if (minVec(id) > value) minVec(id) = value + + val tmpPrevMean = prevMean(id) + prevMean(id) = (prevMean(id) * cnt + value) / (cnt + 1.0) + prevM2n(id) += (value - prevMean(id)) * (value - tmpPrevMean) + + nnzVec(id) += 1.0 + } + + VectorRDDStatisticalRing(prevMean, + prevM2n, + cnt + 1.0, + nnzVec, + maxVec, + minVec) + } + } + + private def combOp(statistics1: VectorRDDStatisticalRing, statistics2: VectorRDDStatisticalRing): VectorRDDStatisticalRing = { + (statistics1, statistics2) match { + case (VectorRDDStatisticalRing(mean1, m2n1, cnt1, nnz1, max1, min1), + VectorRDDStatisticalRing(mean2, m2n2, cnt2, nnz2, max2, min2)) => + val totalCnt = cnt1 + cnt2 + val deltaMean = mean2 - mean1 + val totalMean = ((mean1 :* nnz1) + (mean2 :* nnz2)) :/ (nnz1 + nnz2) + val totalM2n = m2n1 + m2n2 + ((deltaMean :* deltaMean) :* (nnz1 :* nnz2) :/ (nnz1 + nnz2)) + max2.activeIterator.foreach { + case (id, value) => + if (max1(id) < value) max1(id) = value + } + min2.activeIterator.foreach { + case (id, value) => + if (min1(id) > value) min1(id) = value + } + VectorRDDStatisticalRing(totalMean, totalM2n, totalCnt, nnz1 + nnz2, max1, min1) + } + } + def summarizeStatistics(size: Int): VectorRDDStatisticalSummary = { - val (fakeMean, fakeM2n, totalCnt, nnz, max, min) = self.map(_.toBreeze).aggregate(( + val zeroValue = VectorRDDStatisticalRing( BV.zeros[Double](size), BV.zeros[Double](size), 0.0, BV.zeros[Double](size), - BV.fill(size){Double.MinValue}, - BV.fill(size){Double.MaxValue}))( - seqOp = (c, v) => (c, v) match { - case ((prevMean, prevM2n, cnt, nnzVec, maxVec, minVec), currData) => - currData.activeIterator.map{ case (id, value) => - val tmpPrevMean = prevMean(id) - prevMean(id) = (prevMean(id) * cnt + value) / (cnt + 1.0) - if (maxVec(id) < value) maxVec(id) = value - if (minVec(id) > value) minVec(id) = value - nnzVec(id) += 1.0 - prevM2n(id) += (value - prevMean(id)) * (value - tmpPrevMean) - } - - (prevMean, - prevM2n, - cnt + 1.0, - nnzVec, - maxVec, - minVec) - }, - combOp = (c, v) => (c, v) match { - case ( - (mean1, m2n1, cnt1, nnz1, max1, min1), - (mean2, m2n2, cnt2, nnz2, max2, min2)) => - val totalCnt = cnt1 + cnt2 - val deltaMean = 
mean2 - mean1 - val totalMean = ((mean1 :* nnz1) + (mean2 :* nnz2)) :/ (nnz1 + nnz2) - val totalM2n = m2n1 + m2n2 + ((deltaMean :* deltaMean) :* (nnz1 :* nnz2) :/ (nnz1 + nnz2)) - max2.activeIterator.foreach { case (id, value) => - if (max1(id) < value) max1(id) = value - } - min2.activeIterator.foreach { case (id, value) => - if (min1(id) > value) min1(id) = value - } - (totalMean, totalM2n, totalCnt, nnz1 + nnz2, max1, min1) - } - ) + BV.fill(size)(Double.MinValue), + BV.fill(size)(Double.MaxValue)) + + val breezeVectors = self.collect().map(_.toBreeze) + val VectorRDDStatisticalRing(fakeMean, fakeM2n, totalCnt, nnz, max, min) = breezeVectors.aggregate(zeroValue)(seqOp, combOp) // solve real mean val realMean = fakeMean :* nnz :/ totalCnt // solve real variance val deltaMean = fakeMean :- 0.0 val realVar = fakeM2n - ((deltaMean :* deltaMean) :* (nnz :* (nnz :- totalCnt)) :/ totalCnt) - max :+= 0.0 - min :+= 0.0 + // max, min process, in case of a column is all zero. + // max :+= 0.0 + // min :+= 0.0 realVar :/= totalCnt From 4cfbadf963c04dd88d8677a784492a4adf84a57f Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 1 Apr 2014 16:47:25 +0800 Subject: [PATCH 17/38] fix bug of min max --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 8ab93630a2463..a39b6f81cf6ed 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -20,7 +20,6 @@ import breeze.linalg.{Vector => BV} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD -import breeze.linalg.axpy case class VectorRDDStatisticalSummary( mean: Vector, @@ -35,8 +34,8 @@ private case class VectorRDDStatisticalRing( fakeM2n: BV[Double], totalCnt: Double, nnz: BV[Double], - max: BV[Double], - min: BV[Double]) + fakeMax: BV[Double], + fakeMin: BV[Double]) /** * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an @@ -58,7 +57,9 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { * with the size of Vector as input parameter. 
*/ - private def seqOp(aggregator: VectorRDDStatisticalRing, currData: BV[Double]): VectorRDDStatisticalRing = { + private def seqOp( + aggregator: VectorRDDStatisticalRing, + currData: BV[Double]): VectorRDDStatisticalRing = { aggregator match { case VectorRDDStatisticalRing(prevMean, prevM2n, cnt, nnzVec, maxVec, minVec) => currData.activeIterator.foreach { @@ -73,7 +74,8 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { nnzVec(id) += 1.0 } - VectorRDDStatisticalRing(prevMean, + VectorRDDStatisticalRing( + prevMean, prevM2n, cnt + 1.0, nnzVec, @@ -82,7 +84,9 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { } } - private def combOp(statistics1: VectorRDDStatisticalRing, statistics2: VectorRDDStatisticalRing): VectorRDDStatisticalRing = { + private def combOp( + statistics1: VectorRDDStatisticalRing, + statistics2: VectorRDDStatisticalRing): VectorRDDStatisticalRing = { (statistics1, statistics2) match { case (VectorRDDStatisticalRing(mean1, m2n1, cnt1, nnz1, max1, min1), VectorRDDStatisticalRing(mean2, m2n2, cnt2, nnz2, max2, min2)) => @@ -111,18 +115,26 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { BV.fill(size)(Double.MinValue), BV.fill(size)(Double.MaxValue)) - val breezeVectors = self.collect().map(_.toBreeze) - val VectorRDDStatisticalRing(fakeMean, fakeM2n, totalCnt, nnz, max, min) = breezeVectors.aggregate(zeroValue)(seqOp, combOp) + val breezeVectors = self.map(_.toBreeze) + val VectorRDDStatisticalRing(fakeMean, fakeM2n, totalCnt, nnz, fakeMax, fakeMin) = + breezeVectors.aggregate(zeroValue)(seqOp, combOp) // solve real mean val realMean = fakeMean :* nnz :/ totalCnt // solve real variance val deltaMean = fakeMean :- 0.0 val realVar = fakeM2n - ((deltaMean :* deltaMean) :* (nnz :* (nnz :- totalCnt)) :/ totalCnt) - // max, min process, in case of a column is all zero. 
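The "fake mean" correction shown just above is worth spelling out. The aggregator only folds in non-zero entries, so fakeMean and fakeM2n describe the nnz non-zero values of each column. The implicit zeros form a second group with mean 0 and zero squared deviation; merging the two groups gives realMean = fakeMean * nnz / n and realM2n = fakeM2n + fakeMean^2 * nnz * (n - nnz) / n, which is exactly what the subtraction computes, since (nnz - totalCnt) = -(totalCnt - nnz). A standalone sketch that checks this against a plain two-pass computation (illustrative only, not part of the patch):

object ZeroCorrectionCheck extends App {
  val col = Seq(5.0, 0.0, 0.0, 3.0)
  val n = col.size.toDouble
  val nonZeros = col.filter(_ != 0.0)
  val nnz = nonZeros.size.toDouble

  // what the aggregator accumulates: statistics over the non-zero entries only
  val fakeMean = nonZeros.sum / nnz
  val fakeM2n = nonZeros.map(x => (x - fakeMean) * (x - fakeMean)).sum

  // merge in the implicit zeros (a group with mean 0 and zero squared deviation)
  val realMean = fakeMean * nnz / n
  val realM2n = fakeM2n + fakeMean * fakeMean * nnz * (n - nnz) / n

  // two-pass reference over the full column
  val refMean = col.sum / n
  val refM2n = col.map(x => (x - refMean) * (x - refMean)).sum

  assert(math.abs(realMean - refMean) < 1e-9, "mean correction is wrong")
  assert(math.abs(realM2n - refM2n) < 1e-9, "m2n correction is wrong")
}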
- // max :+= 0.0 - // min :+= 0.0 - + // max, min + val max = Vectors.sparse(size, fakeMax.activeIterator.map { case (id, value) => + if ((value == Double.MinValue) && (realMean(id) != Double.MinValue)) (id, 0.0) + else (id, value) + }.toSeq) + val min = Vectors.sparse(size, fakeMin.activeIterator.map { case (id, value) => + if ((value == Double.MaxValue) && (realMean(id) != Double.MaxValue)) (id, 0.0) + else (id, value) + }.toSeq) + + // get variance realVar :/= totalCnt VectorRDDStatisticalSummary( @@ -130,7 +142,7 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { Vectors.fromBreeze(realVar), totalCnt.toLong, Vectors.fromBreeze(nnz), - Vectors.fromBreeze(max), - Vectors.fromBreeze(min)) + max, + min) } } From f6e8e9aed0d912e4bde8897c553670a735571f62 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 1 Apr 2014 17:31:42 +0800 Subject: [PATCH 18/38] add sparse vectors test --- .../mllib/rdd/VectorRDDFunctionsSuite.scala | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index 087bb8a6ba4f1..c3a4710d3a9f0 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -21,6 +21,7 @@ import org.scalatest.FunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.LocalSparkContext import org.apache.spark.mllib.util.MLUtils._ +import scala.collection.mutable.ArrayBuffer class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { import VectorRDDFunctionsSuite._ @@ -31,19 +32,47 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { Vectors.dense(7.0, 8.0, 9.0) ) + val sparseData = ArrayBuffer(Vectors.sparse(20, Seq((0, 1.0), (9, 2.0), (10, 7.0)))) + for (i <- 0 to 10000) sparseData += Vectors.sparse(20, Seq((9, 0.0))) + sparseData += Vectors.sparse(20, Seq((0, 5.0), (9, 13.0), (16, 2.0))) + sparseData += Vectors.sparse(20, Seq((3, 5.0), (9, 13.0), (18, 2.0))) + test("full-statistics") { val data = sc.parallelize(localData, 2) - val VectorRDDStatisticalSummary(mean, variance, cnt, nnz, max, min) = data.summarizeStatistics(3) + val (VectorRDDStatisticalSummary(mean, variance, cnt, nnz, max, min), denseTime) = time(data.summarizeStatistics(3)) assert(equivVector(mean, Vectors.dense(4.0, 5.0, 6.0)), "Column mean do not match.") assert(equivVector(variance, Vectors.dense(6.0, 6.0, 6.0)), "Column variance do not match.") assert(cnt === 3, "Column cnt do not match.") assert(equivVector(nnz, Vectors.dense(3.0, 3.0, 3.0)), "Column nnz do not match.") assert(equivVector(max, Vectors.dense(7.0, 8.0, 9.0)), "Column max do not match.") assert(equivVector(min, Vectors.dense(1.0, 2.0, 3.0)), "Column min do not match.") + + val dataForSparse = sc.parallelize(sparseData.toSeq, 2) + val (VectorRDDStatisticalSummary(sparseMean, sparseVariance, sparseCnt, sparseNnz, sparseMax, sparseMin), sparseTime) = time(dataForSparse.summarizeStatistics(20)) + /* + assert(equivVector(sparseMean, Vectors.dense(4.0, 5.0, 6.0)), "Column mean do not match.") + assert(equivVector(sparseVariance, Vectors.dense(6.0, 6.0, 6.0)), "Column variance do not match.") + assert(sparseCnt === 3, "Column cnt do not match.") + assert(equivVector(sparseNnz, Vectors.dense(3.0, 3.0, 3.0)), "Column nnz do not match.") + assert(equivVector(sparseMax, 
Vectors.dense(7.0, 8.0, 9.0)), "Column max do not match.") + assert(equivVector(sparseMin, Vectors.dense(1.0, 2.0, 3.0)), "Column min do not match.") + */ + + + + println(s"dense time is $denseTime, sparse time is $sparseTime.") } + } object VectorRDDFunctionsSuite { + def time[R](block: => R): (R, Double) = { + val t0 = System.nanoTime() + val result = block + val t1 = System.nanoTime() + (result, (t1 - t0).toDouble / 1.0e9) + } + def equivVector(lhs: Vector, rhs: Vector): Boolean = { (lhs.toBreeze - rhs.toBreeze).norm(2) < 1e-9 } From 036b7a5cbced3d6a582ce7d3b7cdec4f4ab3a577 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 1 Apr 2014 20:53:48 +0800 Subject: [PATCH 19/38] fix the bug of Nan occur --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index a39b6f81cf6ed..029ef263d5d80 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.mllib.rdd -import breeze.linalg.{Vector => BV} +import breeze.linalg.{Vector => BV, axpy} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD @@ -92,8 +92,14 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { VectorRDDStatisticalRing(mean2, m2n2, cnt2, nnz2, max2, min2)) => val totalCnt = cnt1 + cnt2 val deltaMean = mean2 - mean1 - val totalMean = ((mean1 :* nnz1) + (mean2 :* nnz2)) :/ (nnz1 + nnz2) - val totalM2n = m2n1 + m2n2 + ((deltaMean :* deltaMean) :* (nnz1 :* nnz2) :/ (nnz1 + nnz2)) + mean2.activeIterator.foreach { + case (id, 0.0) => + case (id, value) => mean1(id) = (mean1(id) * nnz1(id) + mean2(id) * nnz2(id)) / (nnz1(id) + nnz2(id)) + } + m2n2.activeIterator.foreach { + case (id, 0.0) => + case (id, value) => m2n1(id) += value + deltaMean(id) * deltaMean(id) * nnz1(id) * nnz2(id) / (nnz1(id)+nnz2(id)) + } max2.activeIterator.foreach { case (id, value) => if (max1(id) < value) max1(id) = value @@ -102,7 +108,8 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { case (id, value) => if (min1(id) > value) min1(id) = value } - VectorRDDStatisticalRing(totalMean, totalM2n, totalCnt, nnz1 + nnz2, max1, min1) + axpy(1.0, nnz2, nnz1) + VectorRDDStatisticalRing(mean1, m2n1, totalCnt, nnz1, max1, min1) } } From 4a5c38dd0066468d140d8bcd9fa45ba29f5e8726 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Tue, 1 Apr 2014 21:28:34 +0800 Subject: [PATCH 20/38] add scala doc, refine code and comments --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 59 ++++++++++++------- .../mllib/rdd/VectorRDDFunctionsSuite.scala | 34 ++++++----- 2 files changed, 56 insertions(+), 37 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 029ef263d5d80..51dc0111a448a 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -16,11 +16,15 @@ */ package org.apache.spark.mllib.rdd -import breeze.linalg.{Vector => BV, axpy} +import breeze.linalg.{axpy, Vector => BV} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD +/** + * Case class of the summary statistics, 
including mean, variance, count, max, min, and non-zero + * elements count. + */ case class VectorRDDStatisticalSummary( mean: Vector, variance: Vector, @@ -29,6 +33,12 @@ case class VectorRDDStatisticalSummary( min: Vector, nonZeroCnt: Vector) extends Serializable +/** + * Case class of the aggregate value for collecting summary statistics from RDD[Vector]. These + * values are relatively with + * [[org.apache.spark.mllib.rdd.VectorRDDStatisticalSummary VectorRDDStatisticalSummary]], the + * latter is computed from the former. + */ private case class VectorRDDStatisticalRing( fakeMean: BV[Double], fakeM2n: BV[Double], @@ -45,18 +55,8 @@ private case class VectorRDDStatisticalRing( class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { /** - * Compute full column-wise statistics for the RDD, including - * {{{ - * Mean: Vector, - * Variance: Vector, - * Count: Double, - * Non-zero count: Vector, - * Maximum elements: Vector, - * Minimum elements: Vector. - * }}}, - * with the size of Vector as input parameter. + * Aggregate function used for aggregating elements in a worker together. */ - private def seqOp( aggregator: VectorRDDStatisticalRing, currData: BV[Double]): VectorRDDStatisticalRing = { @@ -84,6 +84,9 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { } } + /** + * Combine function used for combining intermediate results together from every worker. + */ private def combOp( statistics1: VectorRDDStatisticalRing, statistics2: VectorRDDStatisticalRing): VectorRDDStatisticalRing = { @@ -92,27 +95,38 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { VectorRDDStatisticalRing(mean2, m2n2, cnt2, nnz2, max2, min2)) => val totalCnt = cnt1 + cnt2 val deltaMean = mean2 - mean1 + mean2.activeIterator.foreach { case (id, 0.0) => - case (id, value) => mean1(id) = (mean1(id) * nnz1(id) + mean2(id) * nnz2(id)) / (nnz1(id) + nnz2(id)) + case (id, value) => + mean1(id) = (mean1(id) * nnz1(id) + mean2(id) * nnz2(id)) / (nnz1(id) + nnz2(id)) } + m2n2.activeIterator.foreach { case (id, 0.0) => - case (id, value) => m2n1(id) += value + deltaMean(id) * deltaMean(id) * nnz1(id) * nnz2(id) / (nnz1(id)+nnz2(id)) + case (id, value) => + m2n1(id) += + value + deltaMean(id) * deltaMean(id) * nnz1(id) * nnz2(id) / (nnz1(id)+nnz2(id)) } + max2.activeIterator.foreach { case (id, value) => if (max1(id) < value) max1(id) = value } + min2.activeIterator.foreach { case (id, value) => if (min1(id) > value) min1(id) = value } + axpy(1.0, nnz2, nnz1) VectorRDDStatisticalRing(mean1, m2n1, totalCnt, nnz1, max1, min1) } } + /** + * Compute full column-wise statistics for the RDD with the size of Vector as input parameter. 
+ */ def summarizeStatistics(size: Int): VectorRDDStatisticalSummary = { val zeroValue = VectorRDDStatisticalRing( BV.zeros[Double](size), @@ -122,16 +136,17 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { BV.fill(size)(Double.MinValue), BV.fill(size)(Double.MaxValue)) - val breezeVectors = self.map(_.toBreeze) val VectorRDDStatisticalRing(fakeMean, fakeM2n, totalCnt, nnz, fakeMax, fakeMin) = - breezeVectors.aggregate(zeroValue)(seqOp, combOp) + self.map(_.toBreeze).aggregate(zeroValue)(seqOp, combOp) // solve real mean val realMean = fakeMean :* nnz :/ totalCnt - // solve real variance - val deltaMean = fakeMean :- 0.0 - val realVar = fakeM2n - ((deltaMean :* deltaMean) :* (nnz :* (nnz :- totalCnt)) :/ totalCnt) - // max, min + + // solve real m2n + val deltaMean = fakeMean + val realM2n = fakeM2n - ((deltaMean :* deltaMean) :* (nnz :* (nnz :- totalCnt)) :/ totalCnt) + + // remove the initial value in max and min, i.e. the Double.MaxValue or Double.MinValue. val max = Vectors.sparse(size, fakeMax.activeIterator.map { case (id, value) => if ((value == Double.MinValue) && (realMean(id) != Double.MinValue)) (id, 0.0) else (id, value) @@ -142,11 +157,11 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { }.toSeq) // get variance - realVar :/= totalCnt + realM2n :/= totalCnt VectorRDDStatisticalSummary( Vectors.fromBreeze(realMean), - Vectors.fromBreeze(realVar), + Vectors.fromBreeze(realM2n), totalCnt.toLong, Vectors.fromBreeze(nnz), max, diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index c3a4710d3a9f0..b23d3afdc840a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -17,12 +17,18 @@ package org.apache.spark.mllib.rdd +import scala.collection.mutable.ArrayBuffer + import org.scalatest.FunSuite + import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.LocalSparkContext import org.apache.spark.mllib.util.MLUtils._ -import scala.collection.mutable.ArrayBuffer +/** + * Test suite for the summary statistics of RDD[Vector]. Both the accuracy and the time consuming + * between dense and sparse vector are tested. 
+ */ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { import VectorRDDFunctionsSuite._ @@ -33,13 +39,15 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { ) val sparseData = ArrayBuffer(Vectors.sparse(20, Seq((0, 1.0), (9, 2.0), (10, 7.0)))) - for (i <- 0 to 10000) sparseData += Vectors.sparse(20, Seq((9, 0.0))) + for (i <- 0 until 10000) sparseData += Vectors.sparse(20, Seq((9, 0.0))) sparseData += Vectors.sparse(20, Seq((0, 5.0), (9, 13.0), (16, 2.0))) sparseData += Vectors.sparse(20, Seq((3, 5.0), (9, 13.0), (18, 2.0))) test("full-statistics") { val data = sc.parallelize(localData, 2) - val (VectorRDDStatisticalSummary(mean, variance, cnt, nnz, max, min), denseTime) = time(data.summarizeStatistics(3)) + val (VectorRDDStatisticalSummary(mean, variance, cnt, nnz, max, min), denseTime) = + time(data.summarizeStatistics(3)) + assert(equivVector(mean, Vectors.dense(4.0, 5.0, 6.0)), "Column mean do not match.") assert(equivVector(variance, Vectors.dense(6.0, 6.0, 6.0)), "Column variance do not match.") assert(cnt === 3, "Column cnt do not match.") @@ -48,21 +56,12 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { assert(equivVector(min, Vectors.dense(1.0, 2.0, 3.0)), "Column min do not match.") val dataForSparse = sc.parallelize(sparseData.toSeq, 2) - val (VectorRDDStatisticalSummary(sparseMean, sparseVariance, sparseCnt, sparseNnz, sparseMax, sparseMin), sparseTime) = time(dataForSparse.summarizeStatistics(20)) - /* - assert(equivVector(sparseMean, Vectors.dense(4.0, 5.0, 6.0)), "Column mean do not match.") - assert(equivVector(sparseVariance, Vectors.dense(6.0, 6.0, 6.0)), "Column variance do not match.") - assert(sparseCnt === 3, "Column cnt do not match.") - assert(equivVector(sparseNnz, Vectors.dense(3.0, 3.0, 3.0)), "Column nnz do not match.") - assert(equivVector(sparseMax, Vectors.dense(7.0, 8.0, 9.0)), "Column max do not match.") - assert(equivVector(sparseMin, Vectors.dense(1.0, 2.0, 3.0)), "Column min do not match.") - */ - - + val (_, sparseTime) = time(dataForSparse.summarizeStatistics(20)) println(s"dense time is $denseTime, sparse time is $sparseTime.") + assert(relativeTime(denseTime, sparseTime), + "Relative time between dense and sparse vector doesn't match.") } - } object VectorRDDFunctionsSuite { @@ -76,5 +75,10 @@ object VectorRDDFunctionsSuite { def equivVector(lhs: Vector, rhs: Vector): Boolean = { (lhs.toBreeze - rhs.toBreeze).norm(2) < 1e-9 } + + def relativeTime(lhs: Double, rhs: Double): Boolean = { + val denominator = math.max(lhs, rhs) + math.abs(lhs - rhs) / denominator < 0.3 + } } From 1376ff4d34810c78235ee2664f9bb207975600ff Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Wed, 2 Apr 2014 11:03:13 +0800 Subject: [PATCH 21/38] rename variables and adjust code --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 92 ++++++++----------- .../mllib/rdd/VectorRDDFunctionsSuite.scala | 27 +++--- 2 files changed, 54 insertions(+), 65 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 51dc0111a448a..25a423862a660 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -18,34 +18,20 @@ package org.apache.spark.mllib.rdd import breeze.linalg.{axpy, Vector => BV} -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.linalg.Vector import 
org.apache.spark.rdd.RDD /** * Case class of the summary statistics, including mean, variance, count, max, min, and non-zero * elements count. */ -case class VectorRDDStatisticalSummary( - mean: Vector, - variance: Vector, - count: Long, - max: Vector, - min: Vector, - nonZeroCnt: Vector) extends Serializable - -/** - * Case class of the aggregate value for collecting summary statistics from RDD[Vector]. These - * values are relatively with - * [[org.apache.spark.mllib.rdd.VectorRDDStatisticalSummary VectorRDDStatisticalSummary]], the - * latter is computed from the former. - */ -private case class VectorRDDStatisticalRing( - fakeMean: BV[Double], - fakeM2n: BV[Double], - totalCnt: Double, - nnz: BV[Double], - fakeMax: BV[Double], - fakeMin: BV[Double]) +case class VectorRDDStatisticalAggregator( + mean: BV[Double], + statCounter: BV[Double], + totalCount: Double, + numNonZeros: BV[Double], + max: BV[Double], + min: BV[Double]) /** * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an @@ -58,11 +44,12 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { * Aggregate function used for aggregating elements in a worker together. */ private def seqOp( - aggregator: VectorRDDStatisticalRing, - currData: BV[Double]): VectorRDDStatisticalRing = { + aggregator: VectorRDDStatisticalAggregator, + currData: BV[Double]): VectorRDDStatisticalAggregator = { aggregator match { - case VectorRDDStatisticalRing(prevMean, prevM2n, cnt, nnzVec, maxVec, minVec) => + case VectorRDDStatisticalAggregator(prevMean, prevM2n, cnt, nnzVec, maxVec, minVec) => currData.activeIterator.foreach { + case (id, 0.0) => case (id, value) => if (maxVec(id) < value) maxVec(id) = value if (minVec(id) > value) minVec(id) = value @@ -74,7 +61,7 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { nnzVec(id) += 1.0 } - VectorRDDStatisticalRing( + VectorRDDStatisticalAggregator( prevMean, prevM2n, cnt + 1.0, @@ -88,11 +75,11 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { * Combine function used for combining intermediate results together from every worker. */ private def combOp( - statistics1: VectorRDDStatisticalRing, - statistics2: VectorRDDStatisticalRing): VectorRDDStatisticalRing = { + statistics1: VectorRDDStatisticalAggregator, + statistics2: VectorRDDStatisticalAggregator): VectorRDDStatisticalAggregator = { (statistics1, statistics2) match { - case (VectorRDDStatisticalRing(mean1, m2n1, cnt1, nnz1, max1, min1), - VectorRDDStatisticalRing(mean2, m2n2, cnt2, nnz2, max2, min2)) => + case (VectorRDDStatisticalAggregator(mean1, m2n1, cnt1, nnz1, max1, min1), + VectorRDDStatisticalAggregator(mean2, m2n2, cnt2, nnz2, max2, min2)) => val totalCnt = cnt1 + cnt2 val deltaMean = mean2 - mean1 @@ -120,15 +107,16 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { } axpy(1.0, nnz2, nnz1) - VectorRDDStatisticalRing(mean1, m2n1, totalCnt, nnz1, max1, min1) + VectorRDDStatisticalAggregator(mean1, m2n1, totalCnt, nnz1, max1, min1) } } /** * Compute full column-wise statistics for the RDD with the size of Vector as input parameter. 
*/ - def summarizeStatistics(size: Int): VectorRDDStatisticalSummary = { - val zeroValue = VectorRDDStatisticalRing( + def summarizeStatistics(): VectorRDDStatisticalAggregator = { + val size = self.take(1).head.size + val zeroValue = VectorRDDStatisticalAggregator( BV.zeros[Double](size), BV.zeros[Double](size), 0.0, @@ -136,35 +124,33 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { BV.fill(size)(Double.MinValue), BV.fill(size)(Double.MaxValue)) - val VectorRDDStatisticalRing(fakeMean, fakeM2n, totalCnt, nnz, fakeMax, fakeMin) = + val VectorRDDStatisticalAggregator(currMean, currM2n, totalCnt, nnz, currMax, currMin) = self.map(_.toBreeze).aggregate(zeroValue)(seqOp, combOp) // solve real mean - val realMean = fakeMean :* nnz :/ totalCnt + val realMean = currMean :* nnz :/ totalCnt // solve real m2n - val deltaMean = fakeMean - val realM2n = fakeM2n - ((deltaMean :* deltaMean) :* (nnz :* (nnz :- totalCnt)) :/ totalCnt) + val deltaMean = currMean + val realM2n = currM2n - ((deltaMean :* deltaMean) :* (nnz :* (nnz :- totalCnt)) :/ totalCnt) // remove the initial value in max and min, i.e. the Double.MaxValue or Double.MinValue. - val max = Vectors.sparse(size, fakeMax.activeIterator.map { case (id, value) => - if ((value == Double.MinValue) && (realMean(id) != Double.MinValue)) (id, 0.0) - else (id, value) - }.toSeq) - val min = Vectors.sparse(size, fakeMin.activeIterator.map { case (id, value) => - if ((value == Double.MaxValue) && (realMean(id) != Double.MaxValue)) (id, 0.0) - else (id, value) - }.toSeq) + nnz.activeIterator.foreach { + case (id, 0.0) => + currMax(id) = 0.0 + currMin(id) = 0.0 + case _ => + } // get variance realM2n :/= totalCnt - VectorRDDStatisticalSummary( - Vectors.fromBreeze(realMean), - Vectors.fromBreeze(realM2n), - totalCnt.toLong, - Vectors.fromBreeze(nnz), - max, - min) + VectorRDDStatisticalAggregator( + realMean, + realM2n, + totalCnt, + nnz, + currMax, + currMin) } -} +} \ No newline at end of file diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index b23d3afdc840a..e9923aaecc992 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ - package org.apache.spark.mllib.rdd import scala.collection.mutable.ArrayBuffer @@ -45,18 +44,23 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { test("full-statistics") { val data = sc.parallelize(localData, 2) - val (VectorRDDStatisticalSummary(mean, variance, cnt, nnz, max, min), denseTime) = - time(data.summarizeStatistics(3)) + val (VectorRDDStatisticalAggregator(mean, variance, cnt, nnz, max, min), denseTime) = + time(data.summarizeStatistics()) - assert(equivVector(mean, Vectors.dense(4.0, 5.0, 6.0)), "Column mean do not match.") - assert(equivVector(variance, Vectors.dense(6.0, 6.0, 6.0)), "Column variance do not match.") - assert(cnt === 3, "Column cnt do not match.") - assert(equivVector(nnz, Vectors.dense(3.0, 3.0, 3.0)), "Column nnz do not match.") - assert(equivVector(max, Vectors.dense(7.0, 8.0, 9.0)), "Column max do not match.") - assert(equivVector(min, Vectors.dense(1.0, 2.0, 3.0)), "Column min do not match.") + assert(equivVector(Vectors.fromBreeze(mean), Vectors.dense(4.0, 5.0, 6.0)), + "Column mean do not match.") + assert(equivVector(Vectors.fromBreeze(variance), Vectors.dense(6.0, 6.0, 6.0)), + "Column variance do not match.") + assert(cnt === 3.0, "Column cnt do not match.") + assert(equivVector(Vectors.fromBreeze(nnz), Vectors.dense(3.0, 3.0, 3.0)), + "Column nnz do not match.") + assert(equivVector(Vectors.fromBreeze(max), Vectors.dense(7.0, 8.0, 9.0)), + "Column max do not match.") + assert(equivVector(Vectors.fromBreeze(min), Vectors.dense(1.0, 2.0, 3.0)), + "Column min do not match.") val dataForSparse = sc.parallelize(sparseData.toSeq, 2) - val (_, sparseTime) = time(dataForSparse.summarizeStatistics(20)) + val (_, sparseTime) = time(dataForSparse.summarizeStatistics()) println(s"dense time is $denseTime, sparse time is $sparseTime.") assert(relativeTime(denseTime, sparseTime), @@ -80,5 +84,4 @@ object VectorRDDFunctionsSuite { val denominator = math.max(lhs, rhs) math.abs(lhs - rhs) / denominator < 0.3 } -} - +} \ No newline at end of file From 138300c143ef3943d466f9e31a747b47e5ae13cf Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Wed, 2 Apr 2014 17:35:38 +0800 Subject: [PATCH 22/38] add new Aggregator class --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 101 ++++++++++++++++-- .../mllib/rdd/VectorRDDFunctionsSuite.scala | 4 +- 2 files changed, 96 insertions(+), 9 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 25a423862a660..b5518e4a91a4f 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -18,20 +18,109 @@ package org.apache.spark.mllib.rdd import breeze.linalg.{axpy, Vector => BV} -import org.apache.spark.mllib.linalg.Vector +import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.rdd.RDD /** * Case class of the summary statistics, including mean, variance, count, max, min, and non-zero * elements count. 
*/ +trait VectorRDDStatisticalSummary { + def mean(): Vector + def variance(): Vector + def totalCount(): Long + def numNonZeros(): Vector + def max(): Vector + def min(): Vector +} + +private class Aggregator( + val currMean: BV[Double], + val currM2n: BV[Double], + var totalCnt: Double, + val nnz: BV[Double], + val currMax: BV[Double], + val currMin: BV[Double]) extends VectorRDDStatisticalSummary { + nnz.activeIterator.foreach { + case (id, 0.0) => + currMax(id) = 0.0 + currMin(id) = 0.0 + case _ => + } + override def mean(): Vector = Vectors.fromBreeze(currMean :* nnz :/ totalCnt) + override def variance(): Vector = { + val deltaMean = currMean + val realM2n = currM2n - ((deltaMean :* deltaMean) :* (nnz :* (nnz :- totalCnt)) :/ totalCnt) + realM2n :/= totalCnt + Vectors.fromBreeze(realM2n) + } + + override def totalCount(): Long = totalCnt.toLong + + override def numNonZeros(): Vector = Vectors.fromBreeze(nnz) + override def max(): Vector = Vectors.fromBreeze(currMax) + override def min(): Vector = Vectors.fromBreeze(currMin) + /** + * Aggregate function used for aggregating elements in a worker together. + */ + def add(currData: BV[Double]): this.type = { + currData.activeIterator.foreach { + case (id, 0.0) => + case (id, value) => + if (currMax(id) < value) currMax(id) = value + if (currMin(id) > value) currMin(id) = value + + val tmpPrevMean = currMean(id) + currMean(id) = (currMean(id) * totalCnt + value) / (totalCnt + 1.0) + currM2n(id) += (value - currMean(id)) * (value - tmpPrevMean) + + nnz(id) += 1.0 + totalCnt += 1.0 + } + this + } + /** + * Combine function used for combining intermediate results together from every worker. + */ + def merge(other: this.type): this.type = { + totalCnt += other.totalCnt + val deltaMean = currMean - other.currMean + + other.currMean.activeIterator.foreach { + case (id, 0.0) => + case (id, value) => + currMean(id) = (currMean(id) * nnz(id) + other.currMean(id) * other.nnz(id)) / (nnz(id) + other.nnz(id)) + } + + other.currM2n.activeIterator.foreach { + case (id, 0.0) => + case (id, value) => + currM2n(id) += + value + deltaMean(id) * deltaMean(id) * nnz(id) * other.nnz(id) / (nnz(id)+other.nnz(id)) + } + + other.currMax.activeIterator.foreach { + case (id, value) => + if (currMax(id) < value) currMax(id) = value + } + + other.currMin.activeIterator.foreach { + case (id, value) => + if (currMin(id) > value) currMin(id) = value + } + + axpy(1.0, other.nnz, nnz) + this + } +} + case class VectorRDDStatisticalAggregator( mean: BV[Double], - statCounter: BV[Double], - totalCount: Double, - numNonZeros: BV[Double], - max: BV[Double], - min: BV[Double]) + statCnt: BV[Double], + totalCnt: Double, + nnz: BV[Double], + currMax: BV[Double], + currMin: BV[Double]) /** * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index e9923aaecc992..49cde4b4e11d4 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -38,7 +38,7 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { ) val sparseData = ArrayBuffer(Vectors.sparse(20, Seq((0, 1.0), (9, 2.0), (10, 7.0)))) - for (i <- 0 until 10000) sparseData += Vectors.sparse(20, Seq((9, 0.0))) + for (i <- 0 until 100) sparseData += Vectors.sparse(20, Seq((9, 0.0))) 
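// Note: Seq((9, 0.0)) stores an explicit zero entry, so toBreeze.activeIterator
// still reports index 9; these rows are what exercise the (id, 0.0) guard in add().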
sparseData += Vectors.sparse(20, Seq((0, 5.0), (9, 13.0), (16, 2.0))) sparseData += Vectors.sparse(20, Seq((3, 5.0), (9, 13.0), (18, 2.0))) @@ -63,8 +63,6 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { val (_, sparseTime) = time(dataForSparse.summarizeStatistics()) println(s"dense time is $denseTime, sparse time is $sparseTime.") - assert(relativeTime(denseTime, sparseTime), - "Relative time between dense and sparse vector doesn't match.") } } From 967d041fa806a87a8bdf3bd74fac84a7a6fe7495 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Wed, 2 Apr 2014 17:47:12 +0800 Subject: [PATCH 23/38] full revision with Aggregator class --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 155 ++++-------------- .../mllib/rdd/VectorRDDFunctionsSuite.scala | 14 +- 2 files changed, 42 insertions(+), 127 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index b5518e4a91a4f..23623e2a28309 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -40,14 +40,12 @@ private class Aggregator( var totalCnt: Double, val nnz: BV[Double], val currMax: BV[Double], - val currMin: BV[Double]) extends VectorRDDStatisticalSummary { - nnz.activeIterator.foreach { - case (id, 0.0) => - currMax(id) = 0.0 - currMin(id) = 0.0 - case _ => + val currMin: BV[Double]) extends VectorRDDStatisticalSummary with Serializable { + + override def mean(): Vector = { + Vectors.fromBreeze(currMean :* nnz :/ totalCnt) } - override def mean(): Vector = Vectors.fromBreeze(currMean :* nnz :/ totalCnt) + override def variance(): Vector = { val deltaMean = currMean val realM2n = currM2n - ((deltaMean :* deltaMean) :* (nnz :* (nnz :- totalCnt)) :/ totalCnt) @@ -58,8 +56,23 @@ private class Aggregator( override def totalCount(): Long = totalCnt.toLong override def numNonZeros(): Vector = Vectors.fromBreeze(nnz) - override def max(): Vector = Vectors.fromBreeze(currMax) - override def min(): Vector = Vectors.fromBreeze(currMin) + + override def max(): Vector = { + nnz.activeIterator.foreach { + case (id, 0.0) => currMax(id) = 0.0 + case _ => + } + Vectors.fromBreeze(currMax) + } + + override def min(): Vector = { + nnz.activeIterator.foreach { + case (id, 0.0) => currMin(id) = 0.0 + case _ => + } + Vectors.fromBreeze(currMin) + } + /** * Aggregate function used for aggregating elements in a worker together. */ @@ -75,15 +88,19 @@ private class Aggregator( currM2n(id) += (value - currMean(id)) * (value - tmpPrevMean) nnz(id) += 1.0 - totalCnt += 1.0 } + + totalCnt += 1.0 this } + /** * Combine function used for combining intermediate results together from every worker. */ - def merge(other: this.type): this.type = { + def merge(other: Aggregator): this.type = { + totalCnt += other.totalCnt + val deltaMean = currMean - other.currMean other.currMean.activeIterator.foreach { @@ -114,14 +131,6 @@ private class Aggregator( } } -case class VectorRDDStatisticalAggregator( - mean: BV[Double], - statCnt: BV[Double], - totalCnt: Double, - nnz: BV[Double], - currMax: BV[Double], - currMin: BV[Double]) - /** * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an * implicit conversion. 
Import `org.apache.spark.MLContext._` at the top of your program to use @@ -129,83 +138,13 @@ case class VectorRDDStatisticalAggregator( */ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { - /** - * Aggregate function used for aggregating elements in a worker together. - */ - private def seqOp( - aggregator: VectorRDDStatisticalAggregator, - currData: BV[Double]): VectorRDDStatisticalAggregator = { - aggregator match { - case VectorRDDStatisticalAggregator(prevMean, prevM2n, cnt, nnzVec, maxVec, minVec) => - currData.activeIterator.foreach { - case (id, 0.0) => - case (id, value) => - if (maxVec(id) < value) maxVec(id) = value - if (minVec(id) > value) minVec(id) = value - - val tmpPrevMean = prevMean(id) - prevMean(id) = (prevMean(id) * cnt + value) / (cnt + 1.0) - prevM2n(id) += (value - prevMean(id)) * (value - tmpPrevMean) - - nnzVec(id) += 1.0 - } - - VectorRDDStatisticalAggregator( - prevMean, - prevM2n, - cnt + 1.0, - nnzVec, - maxVec, - minVec) - } - } - - /** - * Combine function used for combining intermediate results together from every worker. - */ - private def combOp( - statistics1: VectorRDDStatisticalAggregator, - statistics2: VectorRDDStatisticalAggregator): VectorRDDStatisticalAggregator = { - (statistics1, statistics2) match { - case (VectorRDDStatisticalAggregator(mean1, m2n1, cnt1, nnz1, max1, min1), - VectorRDDStatisticalAggregator(mean2, m2n2, cnt2, nnz2, max2, min2)) => - val totalCnt = cnt1 + cnt2 - val deltaMean = mean2 - mean1 - - mean2.activeIterator.foreach { - case (id, 0.0) => - case (id, value) => - mean1(id) = (mean1(id) * nnz1(id) + mean2(id) * nnz2(id)) / (nnz1(id) + nnz2(id)) - } - - m2n2.activeIterator.foreach { - case (id, 0.0) => - case (id, value) => - m2n1(id) += - value + deltaMean(id) * deltaMean(id) * nnz1(id) * nnz2(id) / (nnz1(id)+nnz2(id)) - } - - max2.activeIterator.foreach { - case (id, value) => - if (max1(id) < value) max1(id) = value - } - - min2.activeIterator.foreach { - case (id, value) => - if (min1(id) > value) min1(id) = value - } - - axpy(1.0, nnz2, nnz1) - VectorRDDStatisticalAggregator(mean1, m2n1, totalCnt, nnz1, max1, min1) - } - } - /** * Compute full column-wise statistics for the RDD with the size of Vector as input parameter. */ - def summarizeStatistics(): VectorRDDStatisticalAggregator = { + def summarizeStatistics(): VectorRDDStatisticalSummary = { val size = self.take(1).head.size - val zeroValue = VectorRDDStatisticalAggregator( + + val zeroValue = new Aggregator( BV.zeros[Double](size), BV.zeros[Double](size), 0.0, @@ -213,33 +152,9 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { BV.fill(size)(Double.MinValue), BV.fill(size)(Double.MaxValue)) - val VectorRDDStatisticalAggregator(currMean, currM2n, totalCnt, nnz, currMax, currMin) = - self.map(_.toBreeze).aggregate(zeroValue)(seqOp, combOp) - - // solve real mean - val realMean = currMean :* nnz :/ totalCnt - - // solve real m2n - val deltaMean = currMean - val realM2n = currM2n - ((deltaMean :* deltaMean) :* (nnz :* (nnz :- totalCnt)) :/ totalCnt) - - // remove the initial value in max and min, i.e. the Double.MaxValue or Double.MinValue. 
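With the fold logic moved into the class, summarizeStatistics (just below) reduces to a single aggregate call. RDD.aggregate deserializes a fresh copy of the zero value for each task, and its seqOp/combOp are allowed to mutate and return their first argument, so the accumulator can be updated in place instead of allocating per row. A minimal sketch of that pattern, using RDD[Double] instead of RDD[Vector] for brevity (names illustrative, not part of the patch):

import org.apache.spark.{SparkConf, SparkContext}

class SumCount(var sum: Double, var count: Long) extends Serializable {
  def add(x: Double): this.type = { sum += x; count += 1L; this }
  def merge(other: SumCount): this.type = {
    sum += other.sum
    count += other.count
    this
  }
}

object AggregatePatternSketch extends App {
  val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("sketch"))
  val data = sc.parallelize(1 to 1000, 4).map(_.toDouble)
  // each partition folds rows into its own copy of the zero value,
  // then the per-partition accumulators are merged pairwise
  val agg = data.aggregate(new SumCount(0.0, 0L))(_.add(_), _.merge(_))
  println(s"mean = ${agg.sum / agg.count}")  // 500.5
  sc.stop()
}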
- nnz.activeIterator.foreach { - case (id, 0.0) => - currMax(id) = 0.0 - currMin(id) = 0.0 - case _ => - } - - // get variance - realM2n :/= totalCnt - - VectorRDDStatisticalAggregator( - realMean, - realM2n, - totalCnt, - nnz, - currMax, - currMin) + self.map(_.toBreeze).aggregate[Aggregator](zeroValue)( + (aggregator, data) => aggregator.add(data), + (aggregator1, aggregator2) => aggregator1.merge(aggregator2) + ) } } \ No newline at end of file diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index 49cde4b4e11d4..ec76c2279697a 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -44,19 +44,19 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { test("full-statistics") { val data = sc.parallelize(localData, 2) - val (VectorRDDStatisticalAggregator(mean, variance, cnt, nnz, max, min), denseTime) = + val (summary, denseTime) = time(data.summarizeStatistics()) - assert(equivVector(Vectors.fromBreeze(mean), Vectors.dense(4.0, 5.0, 6.0)), + assert(equivVector(summary.mean(), Vectors.dense(4.0, 5.0, 6.0)), "Column mean do not match.") - assert(equivVector(Vectors.fromBreeze(variance), Vectors.dense(6.0, 6.0, 6.0)), + assert(equivVector(summary.variance(), Vectors.dense(6.0, 6.0, 6.0)), "Column variance do not match.") - assert(cnt === 3.0, "Column cnt do not match.") - assert(equivVector(Vectors.fromBreeze(nnz), Vectors.dense(3.0, 3.0, 3.0)), + assert(summary.totalCount() === 3, "Column cnt do not match.") + assert(equivVector(summary.numNonZeros(), Vectors.dense(3.0, 3.0, 3.0)), "Column nnz do not match.") - assert(equivVector(Vectors.fromBreeze(max), Vectors.dense(7.0, 8.0, 9.0)), + assert(equivVector(summary.max(), Vectors.dense(7.0, 8.0, 9.0)), "Column max do not match.") - assert(equivVector(Vectors.fromBreeze(min), Vectors.dense(1.0, 2.0, 3.0)), + assert(equivVector(summary.min(), Vectors.dense(1.0, 2.0, 3.0)), "Column min do not match.") val dataForSparse = sc.parallelize(sparseData.toSeq, 2) From f7a3ca25337a5d7ee025133f22e6873c97f3860e Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Wed, 2 Apr 2014 18:14:52 +0800 Subject: [PATCH 24/38] fix the corner case of maxmin --- .../org/apache/spark/mllib/rdd/VectorRDDFunctions.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 23623e2a28309..736fc363f2e5d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -59,16 +59,16 @@ private class Aggregator( override def max(): Vector = { nnz.activeIterator.foreach { - case (id, 0.0) => currMax(id) = 0.0 - case _ => + case (id, count) => + if ((count == 0.0) || ((count < totalCnt) && (currMax(id) < 0.0))) currMax(id) = 0.0 } Vectors.fromBreeze(currMax) } override def min(): Vector = { nnz.activeIterator.foreach { - case (id, 0.0) => currMin(id) = 0.0 - case _ => + case (id, count) => + if ((count == 0.0) || ((count < totalCnt) && (currMin(id) > 0.0))) currMin(id) = 0.0 } Vectors.fromBreeze(currMin) } From 18cf07215c1e781e6b96c3986e62ec9e3e9fa788 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Wed, 2 Apr 2014 18:23:57 +0800 Subject: [PATCH 25/38] 
change def to lazy val to make sure that the computations in function be evaluated only once --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 43 +++++++++++-------- .../mllib/rdd/VectorRDDFunctionsSuite.scala | 22 ++++++---- 2 files changed, 38 insertions(+), 27 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 736fc363f2e5d..3ddc507a2e601 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.spark.mllib.rdd import breeze.linalg.{axpy, Vector => BV} @@ -26,12 +27,12 @@ import org.apache.spark.rdd.RDD * elements count. */ trait VectorRDDStatisticalSummary { - def mean(): Vector - def variance(): Vector - def totalCount(): Long - def numNonZeros(): Vector - def max(): Vector - def min(): Vector + def mean: Vector + def variance: Vector + def totalCount: Long + def numNonZeros: Vector + def max: Vector + def min: Vector } private class Aggregator( @@ -42,22 +43,24 @@ private class Aggregator( val currMax: BV[Double], val currMin: BV[Double]) extends VectorRDDStatisticalSummary with Serializable { - override def mean(): Vector = { - Vectors.fromBreeze(currMean :* nnz :/ totalCnt) - } + override lazy val mean = Vectors.fromBreeze(currMean :* nnz :/ totalCnt) - override def variance(): Vector = { + override lazy val variance = { val deltaMean = currMean - val realM2n = currM2n - ((deltaMean :* deltaMean) :* (nnz :* (nnz :- totalCnt)) :/ totalCnt) - realM2n :/= totalCnt - Vectors.fromBreeze(realM2n) + var i = 0 + while(i < currM2n.size) { + currM2n(i) -= deltaMean(i) * deltaMean(i) * nnz(i) * (nnz(i)-totalCnt) / totalCnt + currM2n(i) /= totalCnt + i += 1 + } + Vectors.fromBreeze(currM2n) } - override def totalCount(): Long = totalCnt.toLong + override lazy val totalCount: Long = totalCnt.toLong - override def numNonZeros(): Vector = Vectors.fromBreeze(nnz) + override lazy val numNonZeros: Vector = Vectors.fromBreeze(nnz) - override def max(): Vector = { + override lazy val max: Vector = { nnz.activeIterator.foreach { case (id, count) => if ((count == 0.0) || ((count < totalCnt) && (currMax(id) < 0.0))) currMax(id) = 0.0 @@ -65,7 +68,7 @@ private class Aggregator( Vectors.fromBreeze(currMax) } - override def min(): Vector = { + override lazy val min: Vector = { nnz.activeIterator.foreach { case (id, count) => if ((count == 0.0) || ((count < totalCnt) && (currMin(id) > 0.0))) currMin(id) = 0.0 @@ -78,6 +81,7 @@ private class Aggregator( */ def add(currData: BV[Double]): this.type = { currData.activeIterator.foreach { + // this case is used for filtering the zero elements if the vector is a dense one. 
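+ // (A sparse vector can also carry explicitly stored zeros, which activeIterator
+ // still reports, so the same guard keeps nnz and the mean update correct there.)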
case (id, 0.0) => case (id, value) => if (currMax(id) < value) currMax(id) = value @@ -106,7 +110,8 @@ private class Aggregator( other.currMean.activeIterator.foreach { case (id, 0.0) => case (id, value) => - currMean(id) = (currMean(id) * nnz(id) + other.currMean(id) * other.nnz(id)) / (nnz(id) + other.nnz(id)) + currMean(id) = + (currMean(id) * nnz(id) + other.currMean(id) * other.nnz(id)) / (nnz(id) + other.nnz(id)) } other.currM2n.activeIterator.foreach { @@ -157,4 +162,4 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { (aggregator1, aggregator2) => aggregator1.merge(aggregator2) ) } -} \ No newline at end of file +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index ec76c2279697a..5eb9d8e2c3da8 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -14,6 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.spark.mllib.rdd import scala.collection.mutable.ArrayBuffer @@ -21,6 +22,7 @@ import scala.collection.mutable.ArrayBuffer import org.scalatest.FunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.rdd.VectorRDDFunctionsSuite._ import org.apache.spark.mllib.util.LocalSparkContext import org.apache.spark.mllib.util.MLUtils._ @@ -29,7 +31,6 @@ import org.apache.spark.mllib.util.MLUtils._ * between dense and sparse vector are tested. */ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { - import VectorRDDFunctionsSuite._ val localData = Array( Vectors.dense(1.0, 2.0, 3.0), @@ -47,16 +48,21 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { val (summary, denseTime) = time(data.summarizeStatistics()) - assert(equivVector(summary.mean(), Vectors.dense(4.0, 5.0, 6.0)), + assert(equivVector(summary.mean, Vectors.dense(4.0, 5.0, 6.0)), "Column mean do not match.") - assert(equivVector(summary.variance(), Vectors.dense(6.0, 6.0, 6.0)), + + assert(equivVector(summary.variance, Vectors.dense(6.0, 6.0, 6.0)), "Column variance do not match.") - assert(summary.totalCount() === 3, "Column cnt do not match.") - assert(equivVector(summary.numNonZeros(), Vectors.dense(3.0, 3.0, 3.0)), + + assert(summary.totalCount === 3, "Column cnt do not match.") + + assert(equivVector(summary.numNonZeros, Vectors.dense(3.0, 3.0, 3.0)), "Column nnz do not match.") - assert(equivVector(summary.max(), Vectors.dense(7.0, 8.0, 9.0)), + + assert(equivVector(summary.max, Vectors.dense(7.0, 8.0, 9.0)), "Column max do not match.") - assert(equivVector(summary.min(), Vectors.dense(1.0, 2.0, 3.0)), + + assert(equivVector(summary.min, Vectors.dense(1.0, 2.0, 3.0)), "Column min do not match.") val dataForSparse = sc.parallelize(sparseData.toSeq, 2) @@ -82,4 +88,4 @@ object VectorRDDFunctionsSuite { val denominator = math.max(lhs, rhs) math.abs(lhs - rhs) / denominator < 0.3 } -} \ No newline at end of file +} From dc77e38f41f52d7f2fc1ee162c827c6f2fb13442 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Wed, 2 Apr 2014 21:30:12 +0800 Subject: [PATCH 26/38] test sparse vector RDD --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 21 ++++--- .../mllib/rdd/VectorRDDFunctionsSuite.scala | 57 ++++++++++--------- 2 files changed, 43 insertions(+), 35 deletions(-) diff --git 
a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 3ddc507a2e601..57f4eec312cb7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -49,7 +49,7 @@ private class Aggregator( val deltaMean = currMean var i = 0 while(i < currM2n.size) { - currM2n(i) -= deltaMean(i) * deltaMean(i) * nnz(i) * (nnz(i)-totalCnt) / totalCnt + currM2n(i) += deltaMean(i) * deltaMean(i) * nnz(i) * (totalCnt-nnz(i)) / totalCnt currM2n(i) /= totalCnt i += 1 } @@ -61,7 +61,7 @@ private class Aggregator( override lazy val numNonZeros: Vector = Vectors.fromBreeze(nnz) override lazy val max: Vector = { - nnz.activeIterator.foreach { + nnz.iterator.foreach { case (id, count) => if ((count == 0.0) || ((count < totalCnt) && (currMax(id) < 0.0))) currMax(id) = 0.0 } @@ -69,7 +69,7 @@ private class Aggregator( } override lazy val min: Vector = { - nnz.activeIterator.foreach { + nnz.iterator.foreach { case (id, count) => if ((count == 0.0) || ((count < totalCnt) && (currMin(id) > 0.0))) currMin(id) = 0.0 } @@ -88,7 +88,7 @@ private class Aggregator( if (currMin(id) > value) currMin(id) = value val tmpPrevMean = currMean(id) - currMean(id) = (currMean(id) * totalCnt + value) / (totalCnt + 1.0) + currMean(id) = (currMean(id) * nnz(id) + value) / (nnz(id) + 1.0) currM2n(id) += (value - currMean(id)) * (value - tmpPrevMean) nnz(id) += 1.0 @@ -114,11 +114,14 @@ private class Aggregator( (currMean(id) * nnz(id) + other.currMean(id) * other.nnz(id)) / (nnz(id) + other.nnz(id)) } - other.currM2n.activeIterator.foreach { - case (id, 0.0) => - case (id, value) => - currM2n(id) += - value + deltaMean(id) * deltaMean(id) * nnz(id) * other.nnz(id) / (nnz(id)+other.nnz(id)) + var i = 0 + while(i < currM2n.size) { + (nnz(i), other.nnz(i)) match { + case (0.0, 0.0) => + case _ => currM2n(i) += + other.currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * other.nnz(i) / (nnz(i)+other.nnz(i)) + } + i += 1 } other.currMax.activeIterator.foreach { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index 5eb9d8e2c3da8..b621bf79b6e8b 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -38,54 +38,59 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { Vectors.dense(7.0, 8.0, 9.0) ) - val sparseData = ArrayBuffer(Vectors.sparse(20, Seq((0, 1.0), (9, 2.0), (10, 7.0)))) - for (i <- 0 until 100) sparseData += Vectors.sparse(20, Seq((9, 0.0))) - sparseData += Vectors.sparse(20, Seq((0, 5.0), (9, 13.0), (16, 2.0))) - sparseData += Vectors.sparse(20, Seq((3, 5.0), (9, 13.0), (18, 2.0))) + val sparseData = ArrayBuffer(Vectors.sparse(3, Seq((0, 1.0)))) + for (i <- 0 until 97) sparseData += Vectors.sparse(3, Seq((2, 0.0))) + sparseData += Vectors.sparse(3, Seq((0, 5.0))) + sparseData += Vectors.sparse(3, Seq((1, 5.0))) - test("full-statistics") { + test("dense statistical summary") { val data = sc.parallelize(localData, 2) - val (summary, denseTime) = - time(data.summarizeStatistics()) + val summary = data.summarizeStatistics() assert(equivVector(summary.mean, Vectors.dense(4.0, 5.0, 6.0)), - "Column mean do not match.") + "Dense column mean do not match.") 
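+    // A worked check of these expectations (illustrative, not part of the test):
+    // column 0 of localData is {1.0, 4.0, 7.0}, so its mean is 4.0 and the
+    // population variance used at this point in the series is
+    // ((1 - 4)^2 + (4 - 4)^2 + (7 - 4)^2) / 3 = 6.0.
+    // The sparse expectations below follow the same way: column 0 of the 100
+    // sparse rows has nonzeros {1.0, 5.0}, giving mean 6.0 / 100 = 0.06 and
+    // variance 26.0 / 100 - 0.06 * 0.06 = 0.2564.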
assert(equivVector(summary.variance, Vectors.dense(6.0, 6.0, 6.0)), - "Column variance do not match.") + "Dense column variance do not match.") - assert(summary.totalCount === 3, "Column cnt do not match.") + assert(summary.totalCount === 3, "Dense column cnt do not match.") assert(equivVector(summary.numNonZeros, Vectors.dense(3.0, 3.0, 3.0)), - "Column nnz do not match.") + "Dense column nnz do not match.") assert(equivVector(summary.max, Vectors.dense(7.0, 8.0, 9.0)), - "Column max do not match.") + "Dense column max do not match.") assert(equivVector(summary.min, Vectors.dense(1.0, 2.0, 3.0)), - "Column min do not match.") + "Dense column min do not match.") + } + test("sparse statistical summary") { val dataForSparse = sc.parallelize(sparseData.toSeq, 2) - val (_, sparseTime) = time(dataForSparse.summarizeStatistics()) + val summary = dataForSparse.summarizeStatistics() + + assert(equivVector(summary.mean, Vectors.dense(0.06, 0.05, 0.0)), + "Sparse column mean do not match.") + + assert(equivVector(summary.variance, Vectors.dense(0.2564, 0.2475, 0.0)), + "Sparse column variance do not match.") + + assert(summary.totalCount === 100, "Sparse column cnt do not match.") + + assert(equivVector(summary.numNonZeros, Vectors.dense(2.0, 1.0, 0.0)), + "Sparse column nnz do not match.") - println(s"dense time is $denseTime, sparse time is $sparseTime.") + assert(equivVector(summary.max, Vectors.dense(5.0, 5.0, 0.0)), + "Sparse column max do not match.") + + assert(equivVector(summary.min, Vectors.dense(0.0, 0.0, 0.0)), + "Sparse column min do not match.") } } object VectorRDDFunctionsSuite { - def time[R](block: => R): (R, Double) = { - val t0 = System.nanoTime() - val result = block - val t1 = System.nanoTime() - (result, (t1 - t0).toDouble / 1.0e9) - } def equivVector(lhs: Vector, rhs: Vector): Boolean = { (lhs.toBreeze - rhs.toBreeze).norm(2) < 1e-9 } - - def relativeTime(lhs: Double, rhs: Double): Boolean = { - val denominator = math.max(lhs, rhs) - math.abs(lhs - rhs) / denominator < 0.3 - } } From 86522c442a494b7520a962ebb6b3a6aac8896c38 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Wed, 2 Apr 2014 23:41:36 +0800 Subject: [PATCH 27/38] add comments on functions --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 57f4eec312cb7..fcb59c571e4f8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -23,8 +23,8 @@ import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.rdd.RDD /** - * Case class of the summary statistics, including mean, variance, count, max, min, and non-zero - * elements count. + * Trait of the summary statistics, including mean, variance, count, max, min, and non-zero elements + * count. */ trait VectorRDDStatisticalSummary { def mean: Vector @@ -35,6 +35,10 @@ trait VectorRDDStatisticalSummary { def min: Vector } +/** + * Aggregates [[org.apache.spark.mllib.rdd.VectorRDDStatisticalSummary VectorRDDStatisticalSummary]] + * together with add() and merge() function. 
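+ * For one column, merge() combines two partial results with the standard
+ * pairwise update; a sketch with illustrative names, assuming partial results
+ * (meanA, m2nA, nA) and (meanB, m2nB, nB):
+ * {{{
+ * val n = nA + nB
+ * val mean = (meanA * nA + meanB * nB) / n
+ * val delta = meanA - meanB
+ * val m2n = m2nA + m2nB + delta * delta * nA * nB / n
+ * }}}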
+ */ private class Aggregator( val currMean: BV[Double], val currM2n: BV[Double], @@ -43,8 +47,15 @@ private class Aggregator( val currMax: BV[Double], val currMin: BV[Double]) extends VectorRDDStatisticalSummary with Serializable { + // lazy val is used for computing only once time. Same below. override lazy val mean = Vectors.fromBreeze(currMean :* nnz :/ totalCnt) + // Online variance solution used in add() function, while parallel variance solution used in + // merge() function. Reference here: + // http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance + // Solution here ignoring the zero elements when calling add() and merge(), for decreasing the + // O(n) algorithm to O(nnz). Real variance is computed here after we get other statistics, simply + // by another parallel combination process. override lazy val variance = { val deltaMean = currMean var i = 0 From 548e9de33291436a459d6ae9f5dc0163bbfbf867 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 3 Apr 2014 12:43:16 +0800 Subject: [PATCH 28/38] minor revision --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 63 +++++++++---------- .../mllib/rdd/VectorRDDFunctionsSuite.scala | 8 +-- 2 files changed, 35 insertions(+), 36 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index fcb59c571e4f8..fcb5a5f18b127 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -17,7 +17,7 @@ package org.apache.spark.mllib.rdd -import breeze.linalg.{axpy, Vector => BV} +import breeze.linalg.{Vector => BV, DenseVector => BDV} import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.rdd.RDD @@ -29,7 +29,7 @@ import org.apache.spark.rdd.RDD trait VectorRDDStatisticalSummary { def mean: Vector def variance: Vector - def totalCount: Long + def count: Long def numNonZeros: Vector def max: Vector def min: Vector @@ -37,44 +37,43 @@ trait VectorRDDStatisticalSummary { /** * Aggregates [[org.apache.spark.mllib.rdd.VectorRDDStatisticalSummary VectorRDDStatisticalSummary]] - * together with add() and merge() function. + * together with add() and merge() function. Online variance solution used in add() function, while + * parallel variance solution used in merge() function. Reference here: + * [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]]. Solution here + * ignoring the zero elements when calling add() and merge(), for decreasing the O(n) algorithm to + * O(nnz). Real variance is computed here after we get other statistics, simply by another parallel + * combination process. */ -private class Aggregator( - val currMean: BV[Double], - val currM2n: BV[Double], +private class VectorRDDStatisticsAggregator( + val currMean: BDV[Double], + val currM2n: BDV[Double], var totalCnt: Double, - val nnz: BV[Double], - val currMax: BV[Double], - val currMin: BV[Double]) extends VectorRDDStatisticalSummary with Serializable { + val nnz: BDV[Double], + val currMax: BDV[Double], + val currMin: BDV[Double]) extends VectorRDDStatisticalSummary with Serializable { // lazy val is used for computing only once time. Same below. override lazy val mean = Vectors.fromBreeze(currMean :* nnz :/ totalCnt) - // Online variance solution used in add() function, while parallel variance solution used in - // merge() function. 
Reference here: - // http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance - // Solution here ignoring the zero elements when calling add() and merge(), for decreasing the - // O(n) algorithm to O(nnz). Real variance is computed here after we get other statistics, simply - // by another parallel combination process. override lazy val variance = { val deltaMean = currMean var i = 0 - while(i < currM2n.size) { - currM2n(i) += deltaMean(i) * deltaMean(i) * nnz(i) * (totalCnt-nnz(i)) / totalCnt + while (i < currM2n.size) { + currM2n(i) += deltaMean(i) * deltaMean(i) * nnz(i) * (totalCnt - nnz(i)) / totalCnt currM2n(i) /= totalCnt i += 1 } Vectors.fromBreeze(currM2n) } - override lazy val totalCount: Long = totalCnt.toLong + override lazy val count: Long = totalCnt.toLong override lazy val numNonZeros: Vector = Vectors.fromBreeze(nnz) override lazy val max: Vector = { nnz.iterator.foreach { case (id, count) => - if ((count == 0.0) || ((count < totalCnt) && (currMax(id) < 0.0))) currMax(id) = 0.0 + if ((count < totalCnt) && (currMax(id) < 0.0)) currMax(id) = 0.0 } Vectors.fromBreeze(currMax) } @@ -82,7 +81,7 @@ private class Aggregator( override lazy val min: Vector = { nnz.iterator.foreach { case (id, count) => - if ((count == 0.0) || ((count < totalCnt) && (currMin(id) > 0.0))) currMin(id) = 0.0 + if ((count < totalCnt) && (currMin(id) > 0.0)) currMin(id) = 0.0 } Vectors.fromBreeze(currMin) } @@ -92,7 +91,7 @@ private class Aggregator( */ def add(currData: BV[Double]): this.type = { currData.activeIterator.foreach { - // this case is used for filtering the zero elements if the vector is a dense one. + // this case is used for filtering the zero elements if the vector. case (id, 0.0) => case (id, value) => if (currMax(id) < value) currMax(id) = value @@ -112,7 +111,7 @@ private class Aggregator( /** * Combine function used for combining intermediate results together from every worker. */ - def merge(other: Aggregator): this.type = { + def merge(other: VectorRDDStatisticsAggregator): this.type = { totalCnt += other.totalCnt @@ -145,7 +144,7 @@ private class Aggregator( if (currMin(id) > value) currMin(id) = value } - axpy(1.0, other.nnz, nnz) + nnz += other.nnz this } } @@ -160,18 +159,18 @@ class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { /** * Compute full column-wise statistics for the RDD with the size of Vector as input parameter. 
*/ - def summarizeStatistics(): VectorRDDStatisticalSummary = { - val size = self.take(1).head.size + def computeSummaryStatistics(): VectorRDDStatisticalSummary = { + val size = self.first().size - val zeroValue = new Aggregator( - BV.zeros[Double](size), - BV.zeros[Double](size), + val zeroValue = new VectorRDDStatisticsAggregator( + BDV.zeros[Double](size), + BDV.zeros[Double](size), 0.0, - BV.zeros[Double](size), - BV.fill(size)(Double.MinValue), - BV.fill(size)(Double.MaxValue)) + BDV.zeros[Double](size), + BDV.fill(size)(Double.MinValue), + BDV.fill(size)(Double.MaxValue)) - self.map(_.toBreeze).aggregate[Aggregator](zeroValue)( + self.map(_.toBreeze).aggregate[VectorRDDStatisticsAggregator](zeroValue)( (aggregator, data) => aggregator.add(data), (aggregator1, aggregator2) => aggregator1.merge(aggregator2) ) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index b621bf79b6e8b..87cfd6c8c436c 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -45,7 +45,7 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { test("dense statistical summary") { val data = sc.parallelize(localData, 2) - val summary = data.summarizeStatistics() + val summary = data.computeSummaryStatistics() assert(equivVector(summary.mean, Vectors.dense(4.0, 5.0, 6.0)), "Dense column mean do not match.") @@ -53,7 +53,7 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { assert(equivVector(summary.variance, Vectors.dense(6.0, 6.0, 6.0)), "Dense column variance do not match.") - assert(summary.totalCount === 3, "Dense column cnt do not match.") + assert(summary.count === 3, "Dense column cnt do not match.") assert(equivVector(summary.numNonZeros, Vectors.dense(3.0, 3.0, 3.0)), "Dense column nnz do not match.") @@ -67,7 +67,7 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { test("sparse statistical summary") { val dataForSparse = sc.parallelize(sparseData.toSeq, 2) - val summary = dataForSparse.summarizeStatistics() + val summary = dataForSparse.computeSummaryStatistics() assert(equivVector(summary.mean, Vectors.dense(0.06, 0.05, 0.0)), "Sparse column mean do not match.") @@ -75,7 +75,7 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { assert(equivVector(summary.variance, Vectors.dense(0.2564, 0.2475, 0.0)), "Sparse column variance do not match.") - assert(summary.totalCount === 100, "Sparse column cnt do not match.") + assert(summary.count === 100, "Sparse column cnt do not match.") assert(equivVector(summary.numNonZeros, Vectors.dense(2.0, 1.0, 0.0)), "Sparse column nnz do not match.") From 69e1f37cad8875ea73ecdd8f58abe6e8bd87bf09 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 3 Apr 2014 13:05:25 +0800 Subject: [PATCH 29/38] remove lazy eval, and minor memory footprint --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 70 +++++++++++-------- 1 file changed, 42 insertions(+), 28 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index fcb5a5f18b127..ee4f940b1a069 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -53,35 +53,46 @@ private class 
VectorRDDStatisticsAggregator( val currMin: BDV[Double]) extends VectorRDDStatisticalSummary with Serializable { // lazy val is used for computing only once time. Same below. - override lazy val mean = Vectors.fromBreeze(currMean :* nnz :/ totalCnt) + override def mean = { + val realMean = BDV.zeros[Double](currMean.length) + var i = 0 + while (i < currMean.length) { + realMean(i) = currMean(i) * nnz(i) / totalCnt + i += 1 + } + Vectors.fromBreeze(realMean) + } - override lazy val variance = { + override def variance = { + val realVariance = BDV.zeros[Double](currM2n.length) val deltaMean = currMean var i = 0 while (i < currM2n.size) { - currM2n(i) += deltaMean(i) * deltaMean(i) * nnz(i) * (totalCnt - nnz(i)) / totalCnt - currM2n(i) /= totalCnt + realVariance(i) = currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * (totalCnt - nnz(i)) / totalCnt + realVariance(i) /= totalCnt i += 1 } - Vectors.fromBreeze(currM2n) + Vectors.fromBreeze(realVariance) } - override lazy val count: Long = totalCnt.toLong + override def count: Long = totalCnt.toLong - override lazy val numNonZeros: Vector = Vectors.fromBreeze(nnz) + override def numNonZeros: Vector = Vectors.fromBreeze(nnz) - override lazy val max: Vector = { - nnz.iterator.foreach { - case (id, count) => - if ((count < totalCnt) && (currMax(id) < 0.0)) currMax(id) = 0.0 + override def max: Vector = { + var i = 0 + while (i < nnz.length) { + if ((nnz(i) < totalCnt) && (currMax(i) < 0.0)) currMax(i) = 0.0 + i += 1 } Vectors.fromBreeze(currMax) } - override lazy val min: Vector = { - nnz.iterator.foreach { - case (id, count) => - if ((count < totalCnt) && (currMin(id) > 0.0)) currMin(id) = 0.0 + override def min: Vector = { + var i = 0 + while (i < nnz.length) { + if ((nnz(i) < totalCnt) && (currMin(i) > 0.0)) currMin(i) = 0.0 + i += 1 } Vectors.fromBreeze(currMin) } @@ -117,15 +128,16 @@ private class VectorRDDStatisticsAggregator( val deltaMean = currMean - other.currMean - other.currMean.activeIterator.foreach { - case (id, 0.0) => - case (id, value) => - currMean(id) = - (currMean(id) * nnz(id) + other.currMean(id) * other.nnz(id)) / (nnz(id) + other.nnz(id)) + var i = 0 + while (i < other.currMean.length) { + if (other.currMean(i) != 0.0) + currMean(i) = (currMean(i) * nnz(i) + other.currMean(i) * other.nnz(i)) / + (nnz(i) + other.nnz(i)) + i += 1 } - var i = 0 - while(i < currM2n.size) { + i = 0 + while (i < currM2n.size) { (nnz(i), other.nnz(i)) match { case (0.0, 0.0) => case _ => currM2n(i) += @@ -134,14 +146,16 @@ private class VectorRDDStatisticsAggregator( i += 1 } - other.currMax.activeIterator.foreach { - case (id, value) => - if (currMax(id) < value) currMax(id) = value + i = 0 + while (i < other.currMax.length) { + if (currMax(i) < other.currMax(i)) currMax(i) = other.currMax(i) + i += 1 } - other.currMin.activeIterator.foreach { - case (id, value) => - if (currMin(id) > value) currMin(id) = value + i = 0 + while (i < other.currMin.length) { + if (currMin(i) > other.currMin(i)) currMin(i) = other.currMin(i) + i += 1 } nnz += other.nnz From 1fba230898c912c3f59940b4f182d53dcb94365e Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 3 Apr 2014 13:37:58 +0800 Subject: [PATCH 30/38] merge while loop together --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 28 +++++++------------ .../mllib/rdd/VectorRDDFunctionsSuite.scala | 1 - 2 files changed, 10 insertions(+), 19 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala 
b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index ee4f940b1a069..1dce0acefba82 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -50,7 +50,8 @@ private class VectorRDDStatisticsAggregator( var totalCnt: Double, val nnz: BDV[Double], val currMax: BDV[Double], - val currMin: BDV[Double]) extends VectorRDDStatisticalSummary with Serializable { + val currMin: BDV[Double]) + extends VectorRDDStatisticalSummary with Serializable { // lazy val is used for computing only once time. Same below. override def mean = { @@ -130,31 +131,22 @@ private class VectorRDDStatisticsAggregator( var i = 0 while (i < other.currMean.length) { - if (other.currMean(i) != 0.0) + // merge mean together + if (other.currMean(i) != 0.0) { currMean(i) = (currMean(i) * nnz(i) + other.currMean(i) * other.nnz(i)) / (nnz(i) + other.nnz(i)) - i += 1 - } + } - i = 0 - while (i < currM2n.size) { - (nnz(i), other.nnz(i)) match { - case (0.0, 0.0) => - case _ => currM2n(i) += - other.currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * other.nnz(i) / (nnz(i)+other.nnz(i)) + // merge m2n together + if (nnz(i) + other.nnz(i) != 0.0) { + currM2n(i) += other.currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * other.nnz(i) / + (nnz(i)+other.nnz(i)) } - i += 1 - } - i = 0 - while (i < other.currMax.length) { if (currMax(i) < other.currMax(i)) currMax(i) = other.currMax(i) - i += 1 - } - i = 0 - while (i < other.currMin.length) { if (currMin(i) > other.currMin(i)) currMin(i) = other.currMin(i) + i += 1 } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index 87cfd6c8c436c..bf1b3693cfbf0 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -89,7 +89,6 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { } object VectorRDDFunctionsSuite { - def equivVector(lhs: Vector, rhs: Vector): Boolean = { (lhs.toBreeze - rhs.toBreeze).norm(2) < 1e-9 } From e624f93a3c49d8d647f537ab9dcad14e10bade8c Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 3 Apr 2014 16:44:30 +0800 Subject: [PATCH 31/38] fix scala style error --- .../org/apache/spark/mllib/rdd/VectorRDDFunctions.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 1dce0acefba82..6e6dd242d3853 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -69,7 +69,8 @@ private class VectorRDDStatisticsAggregator( val deltaMean = currMean var i = 0 while (i < currM2n.size) { - realVariance(i) = currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * (totalCnt - nnz(i)) / totalCnt + realVariance(i) = + currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * (totalCnt - nnz(i)) / totalCnt realVariance(i) /= totalCnt i += 1 } @@ -140,7 +141,7 @@ private class VectorRDDStatisticsAggregator( // merge m2n together if (nnz(i) + other.nnz(i) != 0.0) { currM2n(i) += other.currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * other.nnz(i) / - (nnz(i)+other.nnz(i)) + (nnz(i) + other.nnz(i)) } if (currMax(i) < other.currMax(i)) 
currMax(i) = other.currMax(i) From 48ee053b86210d0d2ff03c13a0c4187d962c0a5d Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Fri, 4 Apr 2014 09:52:23 +0800 Subject: [PATCH 32/38] fix minor error --- .../spark/mllib/rdd/VectorRDDFunctions.scala | 27 +++++++++++++++++-- .../mllib/rdd/VectorRDDFunctionsSuite.scala | 18 ++++++------- 2 files changed, 34 insertions(+), 11 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala index 6e6dd242d3853..0b677d9c4fdef 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala @@ -27,11 +27,35 @@ import org.apache.spark.rdd.RDD * count. */ trait VectorRDDStatisticalSummary { + + /** + * Computes the mean of columns in RDD[Vector]. + */ def mean: Vector + + /** + * Computes the sample variance of columns in RDD[Vector]. + */ def variance: Vector + + /** + * Computes number of vectors in RDD[Vector]. + */ def count: Long + + /** + * Computes the number of non-zero elements in each column of RDD[Vector]. + */ def numNonZeros: Vector + + /** + * Computes the maximum of each column in RDD[Vector]. + */ def max: Vector + + /** + * Computes the minimum of each column in RDD[Vector]. + */ def min: Vector } @@ -53,7 +77,6 @@ private class VectorRDDStatisticsAggregator( val currMin: BDV[Double]) extends VectorRDDStatisticalSummary with Serializable { - // lazy val is used for computing only once time. Same below. override def mean = { val realMean = BDV.zeros[Double](currMean.length) var i = 0 @@ -71,7 +94,7 @@ private class VectorRDDStatisticsAggregator( while (i < currM2n.size) { realVariance(i) = currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * (totalCnt - nnz(i)) / totalCnt - realVariance(i) /= totalCnt + realVariance(i) /= (totalCnt - 1.0) i += 1 } Vectors.fromBreeze(realVariance) diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala index bf1b3693cfbf0..9bf92d54429a4 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala @@ -34,8 +34,8 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { val localData = Array( Vectors.dense(1.0, 2.0, 3.0), - Vectors.dense(4.0, 5.0, 6.0), - Vectors.dense(7.0, 8.0, 9.0) + Vectors.dense(4.0, 0.0, 6.0), + Vectors.dense(0.0, 8.0, 9.0) ) val sparseData = ArrayBuffer(Vectors.sparse(3, Seq((0, 1.0)))) @@ -47,21 +47,21 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { val data = sc.parallelize(localData, 2) val summary = data.computeSummaryStatistics() - assert(equivVector(summary.mean, Vectors.dense(4.0, 5.0, 6.0)), + assert(equivVector(summary.mean, Vectors.dense(5.0 / 3.0, 10.0 / 3.0, 6.0)), "Dense column mean do not match.") - assert(equivVector(summary.variance, Vectors.dense(6.0, 6.0, 6.0)), + assert(equivVector(summary.variance, Vectors.dense(4.333333333333334, 17.333333333333336, 9.0)), "Dense column variance do not match.") assert(summary.count === 3, "Dense column cnt do not match.") - assert(equivVector(summary.numNonZeros, Vectors.dense(3.0, 3.0, 3.0)), + assert(equivVector(summary.numNonZeros, Vectors.dense(2.0, 2.0, 3.0)), "Dense column nnz do not match.") - assert(equivVector(summary.max, Vectors.dense(7.0, 8.0, 
9.0)), + assert(equivVector(summary.max, Vectors.dense(4.0, 8.0, 9.0)), "Dense column max do not match.") - assert(equivVector(summary.min, Vectors.dense(1.0, 2.0, 3.0)), + assert(equivVector(summary.min, Vectors.dense(0.0, 0.0, 3.0)), "Dense column min do not match.") } @@ -72,7 +72,7 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { assert(equivVector(summary.mean, Vectors.dense(0.06, 0.05, 0.0)), "Sparse column mean do not match.") - assert(equivVector(summary.variance, Vectors.dense(0.2564, 0.2475, 0.0)), + assert(equivVector(summary.variance, Vectors.dense(0.258989898989899, 0.25, 0.0)), "Sparse column variance do not match.") assert(summary.count === 100, "Sparse column cnt do not match.") @@ -90,6 +90,6 @@ class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { object VectorRDDFunctionsSuite { def equivVector(lhs: Vector, rhs: Vector): Boolean = { - (lhs.toBreeze - rhs.toBreeze).norm(2) < 1e-9 + (lhs.toBreeze - rhs.toBreeze).norm(2) < 1e-5 } } From 4eaf28af25f981f147a3bfc95dcc08ab883497b6 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Wed, 9 Apr 2014 19:28:16 +0800 Subject: [PATCH 33/38] merge VectorRDDStatistics into RowMatrix --- .../mllib/linalg/distributed/RowMatrix.scala | 186 +++++++++++++++- .../spark/mllib/rdd/VectorRDDFunctions.scala | 208 ------------------ .../org/apache/spark/mllib/util/MLUtils.scala | 1 - .../linalg/distributed/RowMatrixSuite.scala | 45 ++++ .../mllib/rdd/VectorRDDFunctionsSuite.scala | 95 -------- 5 files changed, 229 insertions(+), 306 deletions(-) delete mode 100644 mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala delete mode 100644 mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index f65f43dd3007b..d970c3db16bf2 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -19,7 +19,7 @@ package org.apache.spark.mllib.linalg.distributed import java.util -import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, svd => brzSvd} +import breeze.linalg.{Vector => BV, DenseMatrix => BDM, DenseVector => BDV, svd => brzSvd} import breeze.numerics.{sqrt => brzSqrt} import com.github.fommil.netlib.BLAS.{getInstance => blas} @@ -29,7 +29,171 @@ import org.apache.spark.rdd.RDD import org.apache.spark.Logging /** - * :: Experimental :: + * Trait of the summary statistics, including mean, variance, count, max, min, and non-zero elements + * count. + */ +trait VectorRDDStatisticalSummary { + + /** + * Computes the mean of columns in RDD[Vector]. + */ + def mean: Vector + + /** + * Computes the sample variance of columns in RDD[Vector]. + */ + def variance: Vector + + /** + * Computes number of vectors in RDD[Vector]. + */ + def count: Long + + /** + * Computes the number of non-zero elements in each column of RDD[Vector]. + */ + def numNonZeros: Vector + + /** + * Computes the maximum of each column in RDD[Vector]. + */ + def max: Vector + + /** + * Computes the minimum of each column in RDD[Vector]. + */ + def min: Vector +} + + +/** + * Aggregates [[org.apache.spark.mllib.linalg.distributed.VectorRDDStatisticalSummary + * VectorRDDStatisticalSummary]] together with add() and merge() function. 
Online variance solution + * used in add() function, while parallel variance solution used in merge() function. Reference here + * : [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]]. Solution + * here ignoring the zero elements when calling add() and merge(), for decreasing the O(n) algorithm + * to O(nnz). Real variance is computed here after we get other statistics, simply by another + * parallel combination process. + */ +private class VectorRDDStatisticsAggregator( + val currMean: BDV[Double], + val currM2n: BDV[Double], + var totalCnt: Double, + val nnz: BDV[Double], + val currMax: BDV[Double], + val currMin: BDV[Double]) + extends VectorRDDStatisticalSummary with Serializable { + + override def mean = { + val realMean = BDV.zeros[Double](currMean.length) + var i = 0 + while (i < currMean.length) { + realMean(i) = currMean(i) * nnz(i) / totalCnt + i += 1 + } + Vectors.fromBreeze(realMean) + } + + override def variance = { + val realVariance = BDV.zeros[Double](currM2n.length) + + val denominator = totalCnt - 1.0 + + // Sample variance is computed, if the denominator is 0, the variance is just 0. + if (denominator != 0.0) { + val deltaMean = currMean + var i = 0 + while (i < currM2n.size) { + realVariance(i) = + currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * (totalCnt - nnz(i)) / totalCnt + realVariance(i) /= denominator + i += 1 + } + } + + Vectors.fromBreeze(realVariance) + } + + override def count: Long = totalCnt.toLong + + override def numNonZeros: Vector = Vectors.fromBreeze(nnz) + + override def max: Vector = { + var i = 0 + while (i < nnz.length) { + if ((nnz(i) < totalCnt) && (currMax(i) < 0.0)) currMax(i) = 0.0 + i += 1 + } + Vectors.fromBreeze(currMax) + } + + override def min: Vector = { + var i = 0 + while (i < nnz.length) { + if ((nnz(i) < totalCnt) && (currMin(i) > 0.0)) currMin(i) = 0.0 + i += 1 + } + Vectors.fromBreeze(currMin) + } + + /** + * Aggregate function used for aggregating elements in a worker together. + */ + def add(currData: BV[Double]): this.type = { + currData.activeIterator.foreach { + // this case is used for filtering the zero elements if the vector. + case (id, 0.0) => + case (id, value) => + if (currMax(id) < value) currMax(id) = value + if (currMin(id) > value) currMin(id) = value + + val tmpPrevMean = currMean(id) + currMean(id) = (currMean(id) * nnz(id) + value) / (nnz(id) + 1.0) + currM2n(id) += (value - currMean(id)) * (value - tmpPrevMean) + + nnz(id) += 1.0 + } + + totalCnt += 1.0 + this + } + + /** + * Combine function used for combining intermediate results together from every worker. + */ + def merge(other: VectorRDDStatisticsAggregator): this.type = { + + totalCnt += other.totalCnt + + val deltaMean = currMean - other.currMean + + var i = 0 + while (i < other.currMean.length) { + // merge mean together + if (other.currMean(i) != 0.0) { + currMean(i) = (currMean(i) * nnz(i) + other.currMean(i) * other.nnz(i)) / + (nnz(i) + other.nnz(i)) + } + + // merge m2n together + if (nnz(i) + other.nnz(i) != 0.0) { + currM2n(i) += other.currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * other.nnz(i) / + (nnz(i) + other.nnz(i)) + } + + if (currMax(i) < other.currMax(i)) currMax(i) = other.currMax(i) + + if (currMin(i) > other.currMin(i)) currMin(i) = other.currMin(i) + + i += 1 + } + + nnz += other.nnz + this + } +} + +/** * Represents a row-oriented distributed Matrix with no meaningful row indices. 
* * @param rows rows stored as an RDD[Vector] @@ -240,6 +404,24 @@ class RowMatrix( } } + /** + * Compute full column-wise statistics for the RDD with the size of Vector as input parameter. + */ + def multiVariateSummaryStatistics(): VectorRDDStatisticalSummary = { + val zeroValue = new VectorRDDStatisticsAggregator( + BDV.zeros[Double](nCols), + BDV.zeros[Double](nCols), + 0.0, + BDV.zeros[Double](nCols), + BDV.fill(nCols)(Double.MinValue), + BDV.fill(nCols)(Double.MaxValue)) + + rows.map(_.toBreeze).aggregate[VectorRDDStatisticsAggregator](zeroValue)( + (aggregator, data) => aggregator.add(data), + (aggregator1, aggregator2) => aggregator1.merge(aggregator2) + ) + } + /** * Multiply this matrix by a local matrix on the right. * diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala deleted file mode 100644 index 0b677d9c4fdef..0000000000000 --- a/mllib/src/main/scala/org/apache/spark/mllib/rdd/VectorRDDFunctions.scala +++ /dev/null @@ -1,208 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.mllib.rdd - -import breeze.linalg.{Vector => BV, DenseVector => BDV} - -import org.apache.spark.mllib.linalg.{Vectors, Vector} -import org.apache.spark.rdd.RDD - -/** - * Trait of the summary statistics, including mean, variance, count, max, min, and non-zero elements - * count. - */ -trait VectorRDDStatisticalSummary { - - /** - * Computes the mean of columns in RDD[Vector]. - */ - def mean: Vector - - /** - * Computes the sample variance of columns in RDD[Vector]. - */ - def variance: Vector - - /** - * Computes number of vectors in RDD[Vector]. - */ - def count: Long - - /** - * Computes the number of non-zero elements in each column of RDD[Vector]. - */ - def numNonZeros: Vector - - /** - * Computes the maximum of each column in RDD[Vector]. - */ - def max: Vector - - /** - * Computes the minimum of each column in RDD[Vector]. - */ - def min: Vector -} - -/** - * Aggregates [[org.apache.spark.mllib.rdd.VectorRDDStatisticalSummary VectorRDDStatisticalSummary]] - * together with add() and merge() function. Online variance solution used in add() function, while - * parallel variance solution used in merge() function. Reference here: - * [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]]. Solution here - * ignoring the zero elements when calling add() and merge(), for decreasing the O(n) algorithm to - * O(nnz). Real variance is computed here after we get other statistics, simply by another parallel - * combination process. 
- */ -private class VectorRDDStatisticsAggregator( - val currMean: BDV[Double], - val currM2n: BDV[Double], - var totalCnt: Double, - val nnz: BDV[Double], - val currMax: BDV[Double], - val currMin: BDV[Double]) - extends VectorRDDStatisticalSummary with Serializable { - - override def mean = { - val realMean = BDV.zeros[Double](currMean.length) - var i = 0 - while (i < currMean.length) { - realMean(i) = currMean(i) * nnz(i) / totalCnt - i += 1 - } - Vectors.fromBreeze(realMean) - } - - override def variance = { - val realVariance = BDV.zeros[Double](currM2n.length) - val deltaMean = currMean - var i = 0 - while (i < currM2n.size) { - realVariance(i) = - currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * (totalCnt - nnz(i)) / totalCnt - realVariance(i) /= (totalCnt - 1.0) - i += 1 - } - Vectors.fromBreeze(realVariance) - } - - override def count: Long = totalCnt.toLong - - override def numNonZeros: Vector = Vectors.fromBreeze(nnz) - - override def max: Vector = { - var i = 0 - while (i < nnz.length) { - if ((nnz(i) < totalCnt) && (currMax(i) < 0.0)) currMax(i) = 0.0 - i += 1 - } - Vectors.fromBreeze(currMax) - } - - override def min: Vector = { - var i = 0 - while (i < nnz.length) { - if ((nnz(i) < totalCnt) && (currMin(i) > 0.0)) currMin(i) = 0.0 - i += 1 - } - Vectors.fromBreeze(currMin) - } - - /** - * Aggregate function used for aggregating elements in a worker together. - */ - def add(currData: BV[Double]): this.type = { - currData.activeIterator.foreach { - // this case is used for filtering the zero elements if the vector. - case (id, 0.0) => - case (id, value) => - if (currMax(id) < value) currMax(id) = value - if (currMin(id) > value) currMin(id) = value - - val tmpPrevMean = currMean(id) - currMean(id) = (currMean(id) * nnz(id) + value) / (nnz(id) + 1.0) - currM2n(id) += (value - currMean(id)) * (value - tmpPrevMean) - - nnz(id) += 1.0 - } - - totalCnt += 1.0 - this - } - - /** - * Combine function used for combining intermediate results together from every worker. - */ - def merge(other: VectorRDDStatisticsAggregator): this.type = { - - totalCnt += other.totalCnt - - val deltaMean = currMean - other.currMean - - var i = 0 - while (i < other.currMean.length) { - // merge mean together - if (other.currMean(i) != 0.0) { - currMean(i) = (currMean(i) * nnz(i) + other.currMean(i) * other.nnz(i)) / - (nnz(i) + other.nnz(i)) - } - - // merge m2n together - if (nnz(i) + other.nnz(i) != 0.0) { - currM2n(i) += other.currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * other.nnz(i) / - (nnz(i) + other.nnz(i)) - } - - if (currMax(i) < other.currMax(i)) currMax(i) = other.currMax(i) - - if (currMin(i) > other.currMin(i)) currMin(i) = other.currMin(i) - - i += 1 - } - - nnz += other.nnz - this - } -} - -/** - * Extra functions available on RDDs of [[org.apache.spark.mllib.linalg.Vector Vector]] through an - * implicit conversion. Import `org.apache.spark.MLContext._` at the top of your program to use - * these functions. - */ -class VectorRDDFunctions(self: RDD[Vector]) extends Serializable { - - /** - * Compute full column-wise statistics for the RDD with the size of Vector as input parameter. 
- */ - def computeSummaryStatistics(): VectorRDDStatisticalSummary = { - val size = self.first().size - - val zeroValue = new VectorRDDStatisticsAggregator( - BDV.zeros[Double](size), - BDV.zeros[Double](size), - 0.0, - BDV.zeros[Double](size), - BDV.fill(size)(Double.MinValue), - BDV.fill(size)(Double.MaxValue)) - - self.map(_.toBreeze).aggregate[VectorRDDStatisticsAggregator](zeroValue)( - (aggregator, data) => aggregator.add(data), - (aggregator1, aggregator2) => aggregator1.merge(aggregator2) - ) - } -} diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 2bc3ab97ca2fc..ac2360c429e2b 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -265,5 +265,4 @@ object MLUtils { } sqDist } - implicit def rddToVectorRDDFunctions(rdd: RDD[Vector]) = new VectorRDDFunctions(rdd) } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala index 71ee8e8a4f6fd..19c8a7730cb09 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala @@ -137,6 +137,9 @@ class RowMatrixSuite extends FunSuite with LocalSparkContext { brzNorm(v, 1.0) < 1e-6 } + def equivVector(lhs: Vector, rhs: Vector): Boolean = + closeToZero(lhs.toBreeze.asInstanceOf[BDV[Double]] - rhs.toBreeze.asInstanceOf[BDV[Double]]) + def assertColumnEqualUpToSign(A: BDM[Double], B: BDM[Double], k: Int) { assert(A.rows === B.rows) for (j <- 0 until k) { @@ -170,4 +173,46 @@ class RowMatrixSuite extends FunSuite with LocalSparkContext { )) } } + + test("dense statistical summary") { + val summary = denseMat.multiVariateSummaryStatistics() + + assert(equivVector(summary.mean, Vectors.dense(4.5, 3.0, 4.0)), + "Dense column mean do not match.") + + assert(equivVector(summary.variance, Vectors.dense(15.0, 10.0, 10.0)), + "Dense column variance do not match.") + + assert(summary.count === 4, "Dense column cnt do not match.") + + assert(equivVector(summary.numNonZeros, Vectors.dense(3.0, 3.0, 4.0)), + "Dense column nnz do not match.") + + assert(equivVector(summary.max, Vectors.dense(9.0, 7.0, 8.0)), + "Dense column max do not match.") + + assert(equivVector(summary.min, Vectors.dense(0.0, 0.0, 1.0)), + "Dense column min do not match.") + } + + test("sparse statistical summary") { + val summary = sparseMat.multiVariateSummaryStatistics() + + assert(equivVector(summary.mean, Vectors.dense(4.5, 3.0, 4.0)), + "Sparse column mean do not match.") + + assert(equivVector(summary.variance, Vectors.dense(15.0, 10.0, 10.0)), + "Sparse column variance do not match.") + + assert(summary.count === 4, "Sparse column cnt do not match.") + + assert(equivVector(summary.numNonZeros, Vectors.dense(3.0, 3.0, 4.0)), + "Sparse column nnz do not match.") + + assert(equivVector(summary.max, Vectors.dense(9.0, 7.0, 8.0)), + "Sparse column max do not match.") + + assert(equivVector(summary.min, Vectors.dense(0.0, 0.0, 1.0)), + "Sparse column min do not match.") + } } diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala deleted file mode 100644 index 9bf92d54429a4..0000000000000 --- 
a/mllib/src/test/scala/org/apache/spark/mllib/rdd/VectorRDDFunctionsSuite.scala +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.spark.mllib.rdd - -import scala.collection.mutable.ArrayBuffer - -import org.scalatest.FunSuite - -import org.apache.spark.mllib.linalg.{Vector, Vectors} -import org.apache.spark.mllib.rdd.VectorRDDFunctionsSuite._ -import org.apache.spark.mllib.util.LocalSparkContext -import org.apache.spark.mllib.util.MLUtils._ - -/** - * Test suite for the summary statistics of RDD[Vector]. Both the accuracy and the time consuming - * between dense and sparse vector are tested. - */ -class VectorRDDFunctionsSuite extends FunSuite with LocalSparkContext { - - val localData = Array( - Vectors.dense(1.0, 2.0, 3.0), - Vectors.dense(4.0, 0.0, 6.0), - Vectors.dense(0.0, 8.0, 9.0) - ) - - val sparseData = ArrayBuffer(Vectors.sparse(3, Seq((0, 1.0)))) - for (i <- 0 until 97) sparseData += Vectors.sparse(3, Seq((2, 0.0))) - sparseData += Vectors.sparse(3, Seq((0, 5.0))) - sparseData += Vectors.sparse(3, Seq((1, 5.0))) - - test("dense statistical summary") { - val data = sc.parallelize(localData, 2) - val summary = data.computeSummaryStatistics() - - assert(equivVector(summary.mean, Vectors.dense(5.0 / 3.0, 10.0 / 3.0, 6.0)), - "Dense column mean do not match.") - - assert(equivVector(summary.variance, Vectors.dense(4.333333333333334, 17.333333333333336, 9.0)), - "Dense column variance do not match.") - - assert(summary.count === 3, "Dense column cnt do not match.") - - assert(equivVector(summary.numNonZeros, Vectors.dense(2.0, 2.0, 3.0)), - "Dense column nnz do not match.") - - assert(equivVector(summary.max, Vectors.dense(4.0, 8.0, 9.0)), - "Dense column max do not match.") - - assert(equivVector(summary.min, Vectors.dense(0.0, 0.0, 3.0)), - "Dense column min do not match.") - } - - test("sparse statistical summary") { - val dataForSparse = sc.parallelize(sparseData.toSeq, 2) - val summary = dataForSparse.computeSummaryStatistics() - - assert(equivVector(summary.mean, Vectors.dense(0.06, 0.05, 0.0)), - "Sparse column mean do not match.") - - assert(equivVector(summary.variance, Vectors.dense(0.258989898989899, 0.25, 0.0)), - "Sparse column variance do not match.") - - assert(summary.count === 100, "Sparse column cnt do not match.") - - assert(equivVector(summary.numNonZeros, Vectors.dense(2.0, 1.0, 0.0)), - "Sparse column nnz do not match.") - - assert(equivVector(summary.max, Vectors.dense(5.0, 5.0, 0.0)), - "Sparse column max do not match.") - - assert(equivVector(summary.min, Vectors.dense(0.0, 0.0, 0.0)), - "Sparse column min do not match.") - } -} - -object VectorRDDFunctionsSuite { - def equivVector(lhs: Vector, rhs: Vector): Boolean = { - (lhs.toBreeze - rhs.toBreeze).norm(2) < 
1e-5 - } -} From cbbefdba993dfcac68b2199a0728d3feaf42445f Mon Sep 17 00:00:00 2001 From: Xiangrui Meng Date: Wed, 9 Apr 2014 12:01:15 -0700 Subject: [PATCH 34/38] update multivariate statistical summary interface and clean tests --- .../mllib/linalg/distributed/RowMatrix.scala | 156 ++++++++---------- .../stat/MultivariateStatisticalSummary.scala | 56 +++++++ .../linalg/distributed/RowMatrixSuite.scala | 56 ++----- 3 files changed, 135 insertions(+), 133 deletions(-) create mode 100644 mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index d970c3db16bf2..14796cf0ed4a8 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -27,67 +27,31 @@ import org.apache.spark.annotation.Experimental import org.apache.spark.mllib.linalg._ import org.apache.spark.rdd.RDD import org.apache.spark.Logging +import org.apache.spark.mllib.stat.MultivariateStatisticalSummary /** - * Trait of the summary statistics, including mean, variance, count, max, min, and non-zero elements - * count. + * Column statistics aggregator implementing + * [[org.apache.spark.mllib.stat.MultivariateStatisticalSummary]] + * together with add() and merge() function. + * A numerically stable algorithm is implemented to compute sample mean and variance: + *[[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]]. + * Zero elements (including explicit zero values) are skipped when calling add() and merge(), + * to have time complexity O(nnz) instead of O(n) for each column. */ -trait VectorRDDStatisticalSummary { +private class ColumnStatisticsAggregator(private val n: Int) + extends MultivariateStatisticalSummary with Serializable { - /** - * Computes the mean of columns in RDD[Vector]. - */ - def mean: Vector - - /** - * Computes the sample variance of columns in RDD[Vector]. - */ - def variance: Vector - - /** - * Computes number of vectors in RDD[Vector]. - */ - def count: Long - - /** - * Computes the number of non-zero elements in each column of RDD[Vector]. - */ - def numNonZeros: Vector - - /** - * Computes the maximum of each column in RDD[Vector]. - */ - def max: Vector - - /** - * Computes the minimum of each column in RDD[Vector]. - */ - def min: Vector -} - - -/** - * Aggregates [[org.apache.spark.mllib.linalg.distributed.VectorRDDStatisticalSummary - * VectorRDDStatisticalSummary]] together with add() and merge() function. Online variance solution - * used in add() function, while parallel variance solution used in merge() function. Reference here - * : [[http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance variance-wiki]]. Solution - * here ignoring the zero elements when calling add() and merge(), for decreasing the O(n) algorithm - * to O(nnz). Real variance is computed here after we get other statistics, simply by another - * parallel combination process. 
- */ -private class VectorRDDStatisticsAggregator( - val currMean: BDV[Double], - val currM2n: BDV[Double], - var totalCnt: Double, - val nnz: BDV[Double], - val currMax: BDV[Double], - val currMin: BDV[Double]) - extends VectorRDDStatisticalSummary with Serializable { + private val currMean: BDV[Double] = BDV.zeros[Double](n) + private val currM2n: BDV[Double] = BDV.zeros[Double](n) + private var totalCnt: Double = 0.0 + private val nnz: BDV[Double] = BDV.zeros[Double](n) + private val currMax: BDV[Double] = BDV.fill(n)(Double.MinValue) + private val currMin: BDV[Double] = BDV.fill(n)(Double.MaxValue) override def mean = { - val realMean = BDV.zeros[Double](currMean.length) + val realMean = BDV.zeros[Double](n) var i = 0 - while (i < currMean.length) { + while (i < n) { realMean(i) = currMean(i) * nnz(i) / totalCnt i += 1 } @@ -95,7 +59,7 @@ private class VectorRDDStatisticsAggregator( } override def variance = { - val realVariance = BDV.zeros[Double](currM2n.length) + val realVariance = BDV.zeros[Double](n) val denominator = totalCnt - 1.0 @@ -116,12 +80,12 @@ private class VectorRDDStatisticsAggregator( override def count: Long = totalCnt.toLong - override def numNonZeros: Vector = Vectors.fromBreeze(nnz) + override def numNonzeros: Vector = Vectors.fromBreeze(nnz) override def max: Vector = { var i = 0 - while (i < nnz.length) { - if ((nnz(i) < totalCnt) && (currMax(i) < 0.0)) currMax(i) = 0.0 + while (i < n) { + if ((nnz(i) < totalCnt) && (currMax(i) < 0.0)) currMax(i) = 0.0 i += 1 } Vectors.fromBreeze(currMax) @@ -129,7 +93,7 @@ private class VectorRDDStatisticsAggregator( override def min: Vector = { var i = 0 - while (i < nnz.length) { + while (i < n) { if ((nnz(i) < totalCnt) && (currMin(i) > 0.0)) currMin(i) = 0.0 i += 1 } @@ -137,21 +101,20 @@ private class VectorRDDStatisticsAggregator( } /** - * Aggregate function used for aggregating elements in a worker together. + * Aggregates a row. */ def add(currData: BV[Double]): this.type = { currData.activeIterator.foreach { - // this case is used for filtering the zero elements if the vector. - case (id, 0.0) => - case (id, value) => - if (currMax(id) < value) currMax(id) = value - if (currMin(id) > value) currMin(id) = value + case (_, 0.0) => // Skip explicit zero elements. + case (i, value) => + if (currMax(i) < value) currMax(i) = value + if (currMin(i) > value) currMin(i) = value - val tmpPrevMean = currMean(id) - currMean(id) = (currMean(id) * nnz(id) + value) / (nnz(id) + 1.0) - currM2n(id) += (value - currMean(id)) * (value - tmpPrevMean) + val tmpPrevMean = currMean(i) + currMean(i) = (currMean(i) * nnz(i) + value) / (nnz(i) + 1.0) + currM2n(i) += (value - currMean(i)) * (value - tmpPrevMean) - nnz(id) += 1.0 + nnz(i) += 1.0 } totalCnt += 1.0 @@ -159,16 +122,18 @@ private class VectorRDDStatisticsAggregator( } /** - * Combine function used for combining intermediate results together from every worker. + * Merges another aggregator. */ - def merge(other: VectorRDDStatisticsAggregator): this.type = { + def merge(other: ColumnStatisticsAggregator): this.type = { + + require(n == other.n, s"Dimensions mismatch. 
Expecting $n but got ${other.n}.") totalCnt += other.totalCnt val deltaMean = currMean - other.currMean var i = 0 - while (i < other.currMean.length) { + while (i < n) { // merge mean together if (other.currMean(i) != 0.0) { currMean(i) = (currMean(i) * nnz(i) + other.currMean(i) * other.nnz(i)) / @@ -189,6 +154,7 @@ private class VectorRDDStatisticsAggregator( } nnz += other.nnz + this } } @@ -346,13 +312,7 @@ class RowMatrix( combOp = (s1: (Long, BDV[Double]), s2: (Long, BDV[Double])) => (s1._1 + s2._1, s1._2 += s2._2) ) - // Update _m if it is not set, or verify its value. - if (nRows <= 0L) { - nRows = m - } else { - require(nRows == m, - s"The number of rows $m is different from what specified or previously computed: ${nRows}.") - } + updateNumRows(m) mean :/= m.toDouble @@ -405,21 +365,16 @@ } /** - * Compute full column-wise statistics for the RDD with the size of Vector as input parameter. + * Computes column-wise summary statistics. */ - def multiVariateSummaryStatistics(): VectorRDDStatisticalSummary = { - val zeroValue = new VectorRDDStatisticsAggregator( - BDV.zeros[Double](nCols), - BDV.zeros[Double](nCols), - 0.0, - BDV.zeros[Double](nCols), - BDV.fill(nCols)(Double.MinValue), - BDV.fill(nCols)(Double.MaxValue)) - - rows.map(_.toBreeze).aggregate[VectorRDDStatisticsAggregator](zeroValue)( + def computeColumnSummaryStatistics(): MultivariateStatisticalSummary = { + val zeroValue = new ColumnStatisticsAggregator(numCols().toInt) + val summary = rows.map(_.toBreeze).aggregate[ColumnStatisticsAggregator](zeroValue)( (aggregator, data) => aggregator.add(data), (aggregator1, aggregator2) => aggregator1.merge(aggregator2) ) + updateNumRows(summary.count) + summary } /** @@ -458,6 +413,27 @@ } mat } + + /** Updates or verifies the number of columns. */ + private def updateNumCols(n: Int) { + if (nCols <= 0) { + nCols = n + } else { + require(nCols == n, + s"The number of columns $n is different from " + + s"what specified or previously computed: ${nCols}.") + } + } + + /** Updates or verifies the number of rows. */ + private def updateNumRows(m: Long) { + if (nRows <= 0) { + nRows = m + } else { + require(nRows == m, + s"The number of rows $m is different from what specified or previously computed: ${nRows}.") + } + } } object RowMatrix { diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala new file mode 100644 index 0000000000000..f9eb343da2b82 --- /dev/null +++ b/mllib/src/main/scala/org/apache/spark/mllib/stat/MultivariateStatisticalSummary.scala @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
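// A minimal standalone sketch (names illustrative, not part of the patches) of the
// numerically stable formulas that ColumnStatisticsAggregator.add() and merge() in
// PATCH 34 above apply per column; see the variance-wiki link cited there.
object StableStatsSketch {

  // Partial summary of one column: observation count, running mean, and running
  // sum of squared deviations from the current mean (M2).
  case class Partial(count: Double, mean: Double, m2: Double)

  // Online (Welford) update with a single new observation, as in add().
  def add(p: Partial, value: Double): Partial = {
    val count = p.count + 1.0
    val delta = value - p.mean
    val mean = p.mean + delta / count
    Partial(count, mean, p.m2 + delta * (value - mean))
  }

  // Pairwise combination of two partial summaries, as in merge().
  def merge(a: Partial, b: Partial): Partial = {
    val count = a.count + b.count
    if (count == 0.0) {
      Partial(0.0, 0.0, 0.0)
    } else {
      val delta = b.mean - a.mean
      Partial(
        count,
        a.mean + delta * b.count / count,
        a.m2 + b.m2 + delta * delta * a.count * b.count / count)
    }
  }

  // Sample variance: zero when there are fewer than two observations, matching
  // both the `denominator > 0.0` guard in variance and the contract documented
  // on MultivariateStatisticalSummary.variance.
  def sampleVariance(p: Partial): Double =
    if (p.count > 1.0) p.m2 / (p.count - 1.0) else 0.0
}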
+ */ + +package org.apache.spark.mllib.stat + +import org.apache.spark.mllib.linalg.Vector + +/** + * Trait for multivariate statistical summary of a data matrix. + */ +trait MultivariateStatisticalSummary { + + /** + * Sample mean vector. + */ + def mean: Vector + + /** + * Sample variance vector. Should return a zero vector if the sample size is 1. + */ + def variance: Vector + + /** + * Sample size. + */ + def count: Long + + /** + * Number of nonzero elements in each column (explicitly presented zero values are not counted). + */ + def numNonzeros: Vector + + /** + * Maximum value of each column. + */ + def max: Vector + + /** + * Minimum value of each column. + */ + def min: Vector +} diff --git a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala index 19c8a7730cb09..c9f9acf4c1335 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/linalg/distributed/RowMatrixSuite.scala @@ -137,9 +137,6 @@ class RowMatrixSuite extends FunSuite with LocalSparkContext { brzNorm(v, 1.0) < 1e-6 } - def equivVector(lhs: Vector, rhs: Vector): Boolean = - closeToZero(lhs.toBreeze.asInstanceOf[BDV[Double]] - rhs.toBreeze.asInstanceOf[BDV[Double]]) - def assertColumnEqualUpToSign(A: BDM[Double], B: BDM[Double], k: Int) { assert(A.rows === B.rows) for (j <- 0 until k) { @@ -174,45 +171,18 @@ } } - test("dense statistical summary") { - val summary = denseMat.multiVariateSummaryStatistics() - - assert(equivVector(summary.mean, Vectors.dense(4.5, 3.0, 4.0)), - "Dense column mean do not match.") - - assert(equivVector(summary.variance, Vectors.dense(15.0, 10.0, 10.0)), - "Dense column variance do not match.") - - assert(summary.count === 4, "Dense column cnt do not match.") - - assert(equivVector(summary.numNonZeros, Vectors.dense(3.0, 3.0, 4.0)), - "Dense column nnz do not match.") - - assert(equivVector(summary.max, Vectors.dense(9.0, 7.0, 8.0)), - "Dense column max do not match.") - - assert(equivVector(summary.min, Vectors.dense(0.0, 0.0, 1.0)), - "Dense column min do not match.") - } - - test("sparse statistical summary") { - val summary = sparseMat.multiVariateSummaryStatistics() - - assert(equivVector(summary.mean, Vectors.dense(4.5, 3.0, 4.0)), - "Sparse column mean do not match.") - - assert(equivVector(summary.variance, Vectors.dense(15.0, 10.0, 10.0)), - "Sparse column variance do not match.") - - assert(summary.count === 4, "Sparse column cnt do not match.") - - assert(equivVector(summary.numNonZeros, Vectors.dense(3.0, 3.0, 4.0)), - "Sparse column nnz do not match.") - - assert(equivVector(summary.max, Vectors.dense(9.0, 7.0, 8.0)), - "Sparse column max do not match.") - - assert(equivVector(summary.min, Vectors.dense(0.0, 0.0, 1.0)), - "Sparse column min do not match.") + test("compute column summary statistics") { + for (mat <- Seq(denseMat, sparseMat)) { + val summary = mat.computeColumnSummaryStatistics() + // Run twice to make sure no internal states are changed.
+ for (k <- 0 to 1) { + assert(summary.mean === Vectors.dense(4.5, 3.0, 4.0), "mean mismatch") + assert(summary.variance === Vectors.dense(15.0, 10.0, 10.0), "variance mismatch") + assert(summary.count === m, "count mismatch") + assert(summary.numNonzeros === Vectors.dense(3.0, 3.0, 4.0), "nnz mismatch") + assert(summary.max === Vectors.dense(9.0, 7.0, 8.0), "max mismatch") + assert(summary.min === Vectors.dense(0.0, 0.0, 1.0), "min mismatch") + } + } } } From b064714a6fa1314e98947cd1dd99cfd5fa6cf593 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 10 Apr 2014 08:22:56 +0800 Subject: [PATCH 35/38] remove computeStat in MLUtils --- .../scala/org/apache/spark/mllib/util/MLUtils.scala | 7 +++---- .../org/apache/spark/mllib/util/MLUtilsSuite.scala | 13 ------------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index ac2360c429e2b..94b343d8148cd 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -17,14 +17,13 @@ package org.apache.spark.mllib.util -import breeze.linalg.{Vector => BV, DenseVector => BDV, SparseVector => BSV, - squaredDistance => breezeSquaredDistance} +import breeze.linalg.{Vector => BV, SparseVector => BSV, squaredDistance => breezeSquaredDistance} import org.apache.spark.annotation.Experimental import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint -import org.apache.spark.mllib.linalg.{Vector, Vectors} +import org.apache.spark.mllib.linalg.Vectors /** * Helper methods to load, save and pre-process data used in ML Lib. @@ -170,7 +169,7 @@ object MLUtils { * xColMean - Row vector with mean for every column (or feature) of the input data * xColSd - Row vector standard deviation for every column (or feature) of the input data.
*/ - private[mllib] def computeStats( + def computeStats( data: RDD[LabeledPoint], numFeatures: Int, numExamples: Long): (Double, Vector, Vector) = { diff --git a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala index e451c350b8d88..812a8434784be 100644 --- a/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/mllib/util/MLUtilsSuite.scala @@ -27,7 +27,6 @@ import com.google.common.base.Charsets import com.google.common.io.Files import org.apache.spark.mllib.linalg.Vectors -import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils._ class MLUtilsSuite extends FunSuite with LocalSparkContext { @@ -56,18 +55,6 @@ class MLUtilsSuite extends FunSuite with LocalSparkContext { } } - test("compute stats") { - val data = Seq.fill(3)(Seq( - LabeledPoint(1.0, Vectors.dense(1.0, 2.0, 3.0)), - LabeledPoint(0.0, Vectors.dense(3.0, 4.0, 5.0)) - )).flatten - val rdd = sc.parallelize(data, 2) - val (meanLabel, mean, std) = MLUtils.computeStats(rdd, 3, 6) - assert(meanLabel === 0.5) - assert(mean === Vectors.dense(2.0, 3.0, 4.0)) - assert(std === Vectors.dense(1.0, 1.0, 1.0)) - } - test("loadLibSVMData") { val lines = """ From 10cf5d32cb71c3649a082e3cbd7027b76ca442c7 Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 10 Apr 2014 12:14:15 +0800 Subject: [PATCH 36/38] refine some return type --- .../apache/spark/mllib/linalg/distributed/RowMatrix.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index 14796cf0ed4a8..b7d8fd90fbf8e 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -43,12 +43,12 @@ private class ColumnStatisticsAggregator(private val n: Int) private val currMean: BDV[Double] = BDV.zeros[Double](n) private val currM2n: BDV[Double] = BDV.zeros[Double](n) - private var totalCnt: Double = 0.0 + private var totalCnt = 0.0 private val nnz: BDV[Double] = BDV.zeros[Double](n) private val currMax: BDV[Double] = BDV.fill(n)(Double.MinValue) private val currMin: BDV[Double] = BDV.fill(n)(Double.MaxValue) - override def mean = { + override def mean: Vector = { val realMean = BDV.zeros[Double](n) var i = 0 while (i < n) { @@ -58,7 +58,7 @@ private class ColumnStatisticsAggregator(private val n: Int) Vectors.fromBreeze(realMean) } - override def variance = { + override def variance: Vector = { val realVariance = BDV.zeros[Double](n) val denominator = totalCnt - 1.0 From 16ae684adb8e49053349d512535ec9a4c4154dcb Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Thu, 10 Apr 2014 13:43:31 +0800 Subject: [PATCH 37/38] fix minor error and remove useless method --- .../mllib/linalg/distributed/RowMatrix.scala | 38 +++++++------------ 1 file changed, 14 insertions(+), 24 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index b7d8fd90fbf8e..ae9575ba5b3ef 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -63,8 +63,8 @@ private class 
ColumnStatisticsAggregator(private val n: Int) val denominator = totalCnt - 1.0 - // Sample variance is computed, if the denominator is 0, the variance is just 0. - if (denominator != 0.0) { + // Sample variance is computed; if the denominator is not greater than 0, the variance is just 0. + if (denominator > 0.0) { val deltaMean = currMean var i = 0 while (i < currM2n.size) { @@ -107,8 +107,12 @@ private class ColumnStatisticsAggregator(private val n: Int) currData.activeIterator.foreach { case (_, 0.0) => // Skip explicit zero elements. case (i, value) => - if (currMax(i) < value) currMax(i) = value - if (currMin(i) > value) currMin(i) = value + if (currMax(i) < value) { + currMax(i) = value + } + if (currMin(i) > value) { + currMin(i) = value + } val tmpPrevMean = currMean(i) currMean(i) = (currMean(i) * nnz(i) + value) / (nnz(i) + 1.0) @@ -125,11 +129,9 @@ private class ColumnStatisticsAggregator(private val n: Int) * Merges another aggregator. */ def merge(other: ColumnStatisticsAggregator): this.type = { - require(n == other.n, s"Dimensions mismatch. Expecting $n but got ${other.n}.") totalCnt += other.totalCnt - val deltaMean = currMean - other.currMean var i = 0 @@ -139,22 +141,21 @@ private class ColumnStatisticsAggregator(private val n: Int) currMean(i) = (currMean(i) * nnz(i) + other.currMean(i) * other.nnz(i)) / (nnz(i) + other.nnz(i)) } - // merge m2n together if (nnz(i) + other.nnz(i) != 0.0) { currM2n(i) += other.currM2n(i) + deltaMean(i) * deltaMean(i) * nnz(i) * other.nnz(i) / (nnz(i) + other.nnz(i)) } - - if (currMax(i) < other.currMax(i)) currMax(i) = other.currMax(i) - - if (currMin(i) > other.currMin(i)) currMin(i) = other.currMin(i) - + if (currMax(i) < other.currMax(i)) { + currMax(i) = other.currMax(i) + } + if (currMin(i) > other.currMin(i)) { + currMin(i) = other.currMin(i) + } i += 1 } nnz += other.nnz - this } } @@ -414,17 +415,6 @@ class RowMatrix( mat } - /** Updates or verifies the number of columns. */ - private def updateNumCols(n: Int) { - if (nCols <= 0) { - nCols = n - } else { - require(nCols == n, - s"The number of columns $n is different from " + - s"what specified or previously computed: ${nCols}.") - } - } - - /** Updates or verifies the number of rows. */ private def updateNumRows(m: Long) { if (nRows <= 0) { From d61363f9965b6792efb40d4af6f7932d219dfd4d Mon Sep 17 00:00:00 2001 From: Xusen Yin Date: Fri, 11 Apr 2014 06:46:23 +0800 Subject: [PATCH 38/38] rebase to latest master --- .../mllib/linalg/distributed/RowMatrix.scala | 1 + .../org/apache/spark/mllib/util/MLUtils.scala | 52 ------- 2 files changed, 1 insertion(+), 52 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala index ae9575ba5b3ef..0c0afcd9ec0d7 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/linalg/distributed/RowMatrix.scala @@ -161,6 +161,7 @@ private class ColumnStatisticsAggregator(private val n: Int) } /** + * :: Experimental :: * Represents a row-oriented distributed Matrix with no meaningful row indices.
* * @param rows rows stored as an RDD[Vector] diff --git a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala index 94b343d8148cd..901c3180eac4c 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala @@ -157,58 +157,6 @@ object MLUtils { dataStr.saveAsTextFile(dir) } - /** - * Utility function to compute mean and standard deviation on a given dataset. - * - * @param data - input data set whose statistics are computed - * @param numFeatures - number of features - * @param numExamples - number of examples in input dataset - * - * @return (yMean, xColMean, xColSd) - Tuple consisting of - * yMean - mean of the labels - * xColMean - Row vector with mean for every column (or feature) of the input data - * xColSd - Row vector standard deviation for every column (or feature) of the input data. - */ - def computeStats( - data: RDD[LabeledPoint], - numFeatures: Int, - numExamples: Long): (Double, Vector, Vector) = { - val brzData = data.map { case LabeledPoint(label, features) => - (label, features.toBreeze) - } - val aggStats = brzData.aggregate( - (0L, 0.0, BDV.zeros[Double](numFeatures), BDV.zeros[Double](numFeatures)) - )( - seqOp = (c, v) => (c, v) match { - case ((n, sumLabel, sum, sumSq), (label, features)) => - features.activeIterator.foreach { case (i, x) => - sumSq(i) += x * x - } - (n + 1L, sumLabel + label, sum += features, sumSq) - }, - combOp = (c1, c2) => (c1, c2) match { - case ((n1, sumLabel1, sum1, sumSq1), (n2, sumLabel2, sum2, sumSq2)) => - (n1 + n2, sumLabel1 + sumLabel2, sum1 += sum2, sumSq1 += sumSq2) - } - ) - val (nl, sumLabel, sum, sumSq) = aggStats - - require(nl > 0, "Input data is empty.") - require(nl == numExamples) - - val n = nl.toDouble - val yMean = sumLabel / n - val mean = sum / n - val std = new Array[Double](sum.length) - var i = 0 - while (i < numFeatures) { - std(i) = sumSq(i) / n - mean(i) * mean(i) - i += 1 - } - - (yMean, Vectors.fromBreeze(mean), Vectors.dense(std)) - } - /** * Returns the squared Euclidean distance between two vectors. The following formula will be used * if it does not introduce too much numerical error:
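A brief usage sketch of the interface this series converges on. It assumes an existing SparkContext `sc` and that a RowMatrix can be constructed directly from an RDD[Vector], as the test suite's denseMat and sparseMat are; the data values are illustrative only:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val mat = new RowMatrix(sc.parallelize(Seq(
  Vectors.dense(1.0, 0.0, 3.0),
  Vectors.dense(4.0, 5.0, 0.0),
  Vectors.dense(0.0, 2.0, 6.0)), 2))

// All six statistics come from one pass over the rows: each partition folds its
// rows into a ColumnStatisticsAggregator via add(), and the partial aggregators
// are then combined via merge().
val summary = mat.computeColumnSummaryStatistics()
summary.mean        // per-column means
summary.variance    // per-column sample variances
summary.numNonzeros // per-column counts of nonzero entries
summary.count       // number of rows (3 here)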