-
Notifications
You must be signed in to change notification settings - Fork 393
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Enable Html stripping #478
Changes from 5 commits
8805e3e
ee87945
20343a7
a5f8591
0f01904
59cf931
5093c66
ac0f692
7655e2c
7086050
740b727
a9f44b2
262efd4
a6e4045
5b17167
072d9b9
715e1da
2a91be0
f974c39
29a343d
e01d3e2
0983191
9143056
b48717a
6cf6e96
91a7095
d66f976
49a81fd
c5c7f8a
97b9ce8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -342,8 +342,10 @@ final class SmartTextVectorizerModel[T <: Text] private[op] | |
val textToHash = groups.getOrElse(TextVectorizationMethod.Hash, Array.empty).map(_._1) | ||
|
||
val categoricalVector: OPVector = categoricalPivotFn(textToPivot) | ||
val textTokens: Seq[TextList] = textToHash.map(tokenize(_).tokens) | ||
val ignorableTextTokens: Seq[TextList] = textToIgnore.map(tokenize(_).tokens) | ||
val textTokens: Seq[TextList] = textToHash.map(t => tokenize(text = t, | ||
analyzer = TextTokenizer.AnalyzerHtmlStrip).tokens) | ||
val ignorableTextTokens: Seq[TextList] = textToIgnore.map(t => tokenize(text = t, | ||
analyzer = TextTokenizer.AnalyzerHtmlStrip).tokens) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here, make this settable |
||
val textVector: OPVector = hash[TextList](textTokens, getTextTransientFeatures, args.hashingParams) | ||
val textNullIndicatorsVector = if (args.shouldTrackNulls) { | ||
getNullIndicatorsVector(textTokens ++ ignorableTextTokens) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,7 +53,7 @@ private[op] trait TransmogrifierDefaults { | |
val NullString: String = OpVectorColumnMetadata.NullString | ||
val OtherString: String = OpVectorColumnMetadata.OtherString | ||
val DefaultNumOfFeatures: Int = 512 | ||
val MaxNumOfFeatures: Int = 16384 | ||
val MaxNumOfFeatures: Int = 1638400000 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the above number is 2^14 please choose a number in bytes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think we don't need to change this number, because i lifted this limit in #477 |
||
val DateListDefault: DateListPivot = DateListPivot.SinceLast | ||
val ReferenceDate: org.joda.time.DateTime = DateTimeUtils.now() | ||
val TopK: Int = 20 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -607,10 +607,9 @@ class SmartTextMapVectorizerTest | |
val transformed = new OpWorkflow().setResultFeatures(output).transform(countryMapDF) | ||
assertVectorLength(transformed, output, TransmogrifierDefaults.TopK + 2, Pivot) | ||
} | ||
it should "treat the edge case of coverage being near 1" in { | ||
it should "treat the edge case of coverage being 1" in { | ||
val maxCard = 100 | ||
val vectorizer = new SmartTextMapVectorizer().setCoveragePct(1.0 - 1e-10).setMaxCardinality(maxCard) | ||
.setMinSupport(1) | ||
val vectorizer = new SmartTextMapVectorizer().setCoveragePct(1.0).setMaxCardinality(maxCard).setMinSupport(1) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these seems to defeat the purpose of the test |
||
.setTrackTextLen(true).setInput(rawCatCountryMap) | ||
val output = vectorizer.getOutput() | ||
val transformed = new OpWorkflow().setResultFeatures(output).transform(countryMapDF) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rather than just changing the default please make this a settable parameter