
Commit

update test
mengxr committed Mar 24, 2015
1 parent f96526d commit 9651aec
Showing 3 changed files with 13 additions and 13 deletions.
@@ -68,8 +68,8 @@ class RegexTokenizer extends UnaryTransformer[String, Seq[String], RegexTokenize
    * param sets regex as splitting on gaps (true) or matching tokens (false)
    * @group param
    */
-  val gaps: BooleanParam = new BooleanParam(this, "gaps",
-    "Set regex to match gaps or tokens", Some(false))
+  val gaps: BooleanParam = new BooleanParam(
+    this, "gaps", "Set regex to match gaps or tokens", Some(false))
 
   /** @group setParam */
   def setGaps(value: Boolean): this.type = set(gaps, value)
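
The gaps parameter reformatted above controls how the tokenizer's regex is applied, as the doc comment says: with gaps = false (the default, Some(false)) the pattern describes the tokens to extract, while with gaps = true it describes the separators to split on. A minimal sketch of the two modes, using only setters that appear in this commit (the column names are assumptions for illustration, not part of the diff):

import org.apache.spark.ml.feature.RegexTokenizer

// Sketch only; "rawText" and "tokens" are assumed column names.
// gaps = false (the default): the regex itself matches tokens, so
// punctuation such as "." can come back as a separate token.
val matchingTokenizer = new RegexTokenizer()
  .setInputCol("rawText")
  .setOutputCol("tokens")
  .setGaps(false)

// gaps = true: the regex matches the gaps between tokens, so the input is
// split on it and a piece like "Te,st." stays together as one token.
val splittingTokenizer = new RegexTokenizer()
  .setInputCol("rawText")
  .setOutputCol("tokens")
  .setGaps(true)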
@@ -54,11 +54,11 @@ public void regexTokenizer() {
       .setGaps(true)
       .setMinTokenLength(3);
 
-    JavaRDD<TextData> rdd = jsc.parallelize(Lists.newArrayList(
-      new TextData("Test of tok.", new String[] {"Test", "tok."}),
-      new TextData("Te,st. punct", new String[] {"Te,st.", "punct"})
+    JavaRDD<TokenizerTestData> rdd = jsc.parallelize(Lists.newArrayList(
+      new TokenizerTestData("Test of tok.", new String[] {"Test", "tok."}),
+      new TokenizerTestData("Te,st. punct", new String[] {"Te,st.", "punct"})
     ));
-    DataFrame dataset = jsql.createDataFrame(rdd, TextData.class);
+    DataFrame dataset = jsql.createDataFrame(rdd, TokenizerTestData.class);
 
     Row[] pairs = myRegExTokenizer.transform(dataset)
       .select("tokens","wantedTokens")
@@ -25,7 +25,7 @@ import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.sql.{DataFrame, Row, SQLContext}
 
 @BeanInfo
-case class TextData(rawText: String, wantedTokens: Seq[String]) {
+case class TokenizerTestData(rawText: String, wantedTokens: Seq[String]) {
   /** Constructor used in [[org.apache.spark.ml.feature.JavaTokenizerSuite]] */
   def this(rawText: String, wantedTokens: Array[String]) = this(rawText, wantedTokens.toSeq)
 }
@@ -46,14 +46,14 @@ class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
       .setOutputCol("tokens")
 
     val dataset0 = sqlContext.createDataFrame(Seq(
-      TextData("Test for tokenization.", Seq("Test", "for", "tokenization", ".")),
-      TextData("Te,st. punct", Seq("Te", ",", "st", ".", "punct"))
+      TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization", ".")),
+      TokenizerTestData("Te,st. punct", Seq("Te", ",", "st", ".", "punct"))
     ))
     testRegexTokenizer(tokenizer, dataset0)
 
     val dataset1 = sqlContext.createDataFrame(Seq(
-      TextData("Test for tokenization.", Seq("Test", "for", "tokenization")),
-      TextData("Te,st. punct", Seq("punct"))
+      TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization")),
+      TokenizerTestData("Te,st. punct", Seq("punct"))
     ))
 
     tokenizer.setMinTokenLength(3)
@@ -64,8 +64,8 @@ class RegexTokenizerSuite extends FunSuite with MLlibTestSparkContext {
       .setGaps(true)
       .setMinTokenLength(0)
     val dataset2 = sqlContext.createDataFrame(Seq(
-      TextData("Test for tokenization.", Seq("Test", "for", "tokenization.")),
-      TextData("Te,st. punct", Seq("Te,st.", "", "punct"))
+      TokenizerTestData("Test for tokenization.", Seq("Test", "for", "tokenization.")),
+      TokenizerTestData("Te,st. punct", Seq("Te,st.", "", "punct"))
     ))
     testRegexTokenizer(tokenizer, dataset2)
   }
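
The suite above delegates the actual check to a testRegexTokenizer helper that lies outside this diff. As a rough, assumed sketch (the real helper in RegexTokenizerSuite may differ), it transforms the dataset and compares each row's produced tokens against the wantedTokens column:

import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.sql.{DataFrame, Row}

// Assumed shape of the helper; not taken from this commit.
def testRegexTokenizer(t: RegexTokenizer, dataset: DataFrame): Unit = {
  t.transform(dataset)
    .select("tokens", "wantedTokens")
    .collect()
    .foreach { case Row(actual, expected) =>
      // each row pairs the produced tokens with the expected ones
      assert(actual == expected)
    }
}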
