
Commit

Style corrections
Augustin Borsu committed Mar 19, 2015
1 parent 38b95a1 commit 6a85982
Showing 1 changed file with 27 additions and 18 deletions.
@@ -19,32 +19,31 @@ package org.apache.spark.ml.feature

 import org.scalatest.FunSuite
 
+import org.apache.spark.SparkException
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.{DataFrame, Row, SQLContext}
 
-
-case class TextData(rawText : String,wantedTokens: Seq[String])
+case class TextData(rawText : String, wantedTokens: Seq[String])
 class TokenizerSuite extends FunSuite with MLlibTestSparkContext {
 
   @transient var sqlContext: SQLContext = _
-  @transient var dataset: DataFrame = _
 
   override def beforeAll(): Unit = {
     super.beforeAll()
     sqlContext = new SQLContext(sc)
   }
 
   test("RegexTokenizer"){
-    var myRegExTokenizer = new RegexTokenizer()
+    val myRegExTokenizer = new RegexTokenizer()
       .setInputCol("rawText")
       .setOutputCol("tokens")
 
-    dataset = sqlContext.createDataFrame(
+    var dataset = sqlContext.createDataFrame(
       sc.parallelize(List(
         TextData("Test for tokenization.",List("Test","for","tokenization",".")),
         TextData("Te,st. punct",List("Te",",","st",".","punct"))
       )))
-    testTokenizer(myRegExTokenizer,dataset)
+    testRegexTokenizer(myRegExTokenizer,dataset)
 
     dataset = sqlContext.createDataFrame(
       sc.parallelize(List(
@@ -53,7 +52,7 @@ class TokenizerSuite extends FunSuite with MLlibTestSparkContext {
       )))
     myRegExTokenizer.asInstanceOf[RegexTokenizer]
       .setMinTokenLength(3)
-    testTokenizer(myRegExTokenizer,dataset)
+    testRegexTokenizer(myRegExTokenizer,dataset)
 
     myRegExTokenizer.asInstanceOf[RegexTokenizer]
       .setPattern("\\s")
@@ -64,31 +63,41 @@ class TokenizerSuite extends FunSuite with MLlibTestSparkContext {
         TextData("Test for tokenization.",List("Test","for","tokenization.")),
         TextData("Te,st. punct",List("Te,st.","","punct"))
       )))
-    testTokenizer(myRegExTokenizer,dataset)
+    testRegexTokenizer(myRegExTokenizer,dataset)
   }
 
-  test("Tokenizer"){
+  test("Tokenizer") {
     val oldTokenizer = new Tokenizer()
       .setInputCol("rawText")
       .setOutputCol("tokens")
-    dataset = sqlContext.createDataFrame(
+    var dataset = sqlContext.createDataFrame(
       sc.parallelize(List(
         TextData("Test for tokenization.",List("test","for","tokenization.")),
         TextData("Te,st. punct",List("te,st.","","punct"))
       )))
     testTokenizer(oldTokenizer,dataset)
   }
 
-  def testTokenizer(t: Tokenizer,dataset: DataFrame){
-    t.transform(dataset)
+  def testTokenizer(t: Tokenizer,dataset: DataFrame): Unit = {
+    t.transform(dataset)
       .select("tokens","wantedTokens")
       .collect().foreach{
-        case Row(tokens: Seq[String], wantedTokens: Seq[String]) =>
-          assert(tokens.length == wantedTokens.length)
-          tokens.zip(wantedTokens).foreach(x => assert(x._1 == x._2))
-        case _ =>
-          println()
-          assert(false)
+        case Row(tokens: Seq[Any], wantedTokens: Seq[Any]) =>
+          assert(tokens === wantedTokens)
+        case e =>
+          throw new SparkException(s"Row $e should contain only tokens and wantedTokens columns")
       }
   }
+
+  def testRegexTokenizer(t: RegexTokenizer,dataset: DataFrame): Unit = {
+    t.transform(dataset)
+      .select("tokens","wantedTokens")
+      .collect().foreach{
+        case Row(tokens: Seq[Any], wantedTokens: Seq[Any]) =>
+          assert(tokens === wantedTokens)
+        case e =>
+          throw new SparkException(s"Row $e should contain only tokens and wantedTokens columns")
+      }
+  }
 
 }
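
For context, a minimal usage sketch of the RegexTokenizer exercised by this suite. This is an illustration only, not part of the commit: it assumes a Spark shell with an active SparkContext named sc, and the Sentence case class, variable names, and sample strings are invented for the example.

// Hypothetical usage sketch of the ml.feature.RegexTokenizer tested above.
// Assumes a live SparkContext `sc` (e.g. spark-shell); Sentence and the
// sample strings are illustrative only.
import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.sql.SQLContext

case class Sentence(rawText: String)

val sqlContext = new SQLContext(sc)
val df = sqlContext.createDataFrame(sc.parallelize(List(
  Sentence("Test for tokenization."),
  Sentence("Te,st. punct"))))

// Same parameters the suite sets; the resulting tokens depend on the
// RegexTokenizer defaults in this version of the patch.
val tokenizer = new RegexTokenizer()
  .setInputCol("rawText")
  .setOutputCol("tokens")
  .setMinTokenLength(3)
  .setPattern("\\s")

tokenizer.transform(df).select("rawText", "tokens").show()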
