diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala index 498ddf6d2a..9e4222b741 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichMapFeature.scala @@ -277,6 +277,7 @@ trait RichMapFeature { * options are from the full entry or from the tokens * @param minLengthStdDev minimum standard deviation of the lengths of tokens in a text field for it to * be hashed instead of ignored + * @param stripHtml indicates whether to strip HTML tags from the text or not before analyzing * @param others additional text features * @return result feature of type Vector */ @@ -304,6 +305,7 @@ trait RichMapFeature { hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm, textLengthType: TextLengthType = SmartTextVectorizer.LengthType, minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev, + stripHtml: Boolean = TextTokenizer.StripHtml, others: Array[FeatureLike[TextMap]] = Array.empty ): FeatureLike[OPVector] = { // scalastyle:on parameter.number @@ -318,6 +320,7 @@ trait RichMapFeature { .setAutoDetectThreshold(autoDetectThreshold) .setDefaultLanguage(defaultLanguage) .setMinTokenLength(minTokenLength) + .setStripHtml(stripHtml) .setToLowercase(toLowercase) .setTopK(topK) .setMinSupport(minSupport) @@ -426,10 +429,9 @@ trait RichMapFeature { * @param defaultLanguage default language to assume in case autoDetectLanguage is disabled or * failed to make a good enough prediction. * @param hashAlgorithm hash algorithm to use - * @param tokenizeForLengths If true, then the length counts will be lengths of the tokens in the entries. 
- * If false, then the length counts will be the lengths of the entire entries * @param minLengthStdDev minimum standard deviation of the lengths of tokens in a text field for it to * be hashed instead of ignored + * @param stripHtml indicates whether to strip HTML tags from the text or not before analyzing * @param others additional text features * @return result feature of type Vector */ @@ -457,6 +459,7 @@ trait RichMapFeature { hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm, textLengthType: TextLengthType = SmartTextVectorizer.LengthType, minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev, + stripHtml: Boolean = TextTokenizer.StripHtml, others: Array[FeatureLike[TextAreaMap]] = Array.empty ): FeatureLike[OPVector] = { // scalastyle:on parameter.number @@ -471,6 +474,7 @@ trait RichMapFeature { .setAutoDetectThreshold(autoDetectThreshold) .setDefaultLanguage(defaultLanguage) .setMinTokenLength(minTokenLength) + .setStripHtml(stripHtml) .setToLowercase(toLowercase) .setTopK(topK) .setMinSupport(minSupport) diff --git a/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala b/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala index 6f6d7d9876..8b204f7939 100644 --- a/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala +++ b/core/src/main/scala/com/salesforce/op/dsl/RichTextFeature.scala @@ -115,6 +115,7 @@ trait RichTextFeature { * confidence greater than the threshold then defaultLanguage is used. * @param hashSpaceStrategy strategy to determine whether to use shared hash space for all included features * @param minTokenLength minimum token length, >= 1. + * @param stripHtml indicates whether to strip HTML tags from the text or not before analyzing * @param trackNulls indicates whether or not to track null values in a separate column. 
* Since features may be combined into a shared hash space here, the null value * should be tracked separately @@ -137,6 +138,7 @@ trait RichTextFeature { autoDetectLanguage: Boolean, minTokenLength: Int, toLowercase: Boolean, + stripHtml: Boolean = TextTokenizer.StripHtml, trackNulls: Boolean = TransmogrifierDefaults.TrackNulls, trackTextLen: Boolean = TransmogrifierDefaults.TrackTextLen, hashWithIndex: Boolean = TransmogrifierDefaults.HashWithIndex, @@ -153,7 +155,7 @@ trait RichTextFeature { // scalastyle:on parameter.number val tokenized = (f +: others).map(_.tokenize( languageDetector = languageDetector, - analyzer = analyzer, + analyzer = if (stripHtml) TextTokenizer.AnalyzerHtmlStrip else analyzer, autoDetectLanguage = autoDetectLanguage, autoDetectThreshold = autoDetectThreshold, defaultLanguage = defaultLanguage, @@ -241,6 +243,7 @@ trait RichTextFeature { hashAlgorithm: HashAlgorithm = TransmogrifierDefaults.HashAlgorithm, textLengthType: TextLengthType = SmartTextVectorizer.LengthType, minLengthStdDev: Double = SmartTextVectorizer.MinTextLengthStdDev, + stripHtml: Boolean = TextTokenizer.StripHtml, others: Array[FeatureLike[T]] = Array.empty ): FeatureLike[OPVector] = { // scalastyle:on parameter.number @@ -254,6 +257,7 @@ trait RichTextFeature { .setAutoDetectThreshold(autoDetectThreshold) .setDefaultLanguage(defaultLanguage) .setMinTokenLength(minTokenLength) + .setStripHtml(stripHtml) .setToLowercase(toLowercase) .setTopK(topK) .setMinSupport(minSupport) @@ -375,7 +379,7 @@ trait RichTextFeature { minTokenLength: Int = TextTokenizer.MinTokenLength, toLowercase: Boolean = TextTokenizer.ToLowercase ): FeatureLike[TextList] = { - + // html stripping won't work here since the analyzer is a LuceneRegexTextAnalyzer tokenize( languageDetector = TextTokenizer.LanguageDetector, analyzer = new LuceneRegexTextAnalyzer(pattern, group), diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala
b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala index 3990f01eb7..3d75b4c8cf 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizer.scala @@ -221,6 +221,7 @@ class SmartTextMapVectorizer[T <: OPMap[String]] .setMinTokenLength(getMinTokenLength) .setToLowercase(getToLowercase) .setTrackTextLen($(trackTextLen)) + .setStripHtml(getStripHtml) } } diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala index 2874aef29e..60e8a53c15 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizer.scala @@ -148,6 +148,7 @@ class SmartTextVectorizer[T <: Text](uid: String = UID[SmartTextVectorizer[T]])( .setMinTokenLength(getMinTokenLength) .setToLowercase(getToLowercase) .setTrackTextLen($(trackTextLen)) + .setStripHtml(getStripHtml) } private def makeVectorMetadata(smartTextParams: SmartTextVectorizerModelArgs): OpVectorMetadata = { diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala index 2093c28f26..710874c27a 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/TextTokenizer.scala @@ -83,18 +83,24 @@ trait TextTokenizerParams extends LanguageDetectionParams with TextMatchingParam def setMinTokenLength(value: Int): this.type = set(minTokenLength, value) def getMinTokenLength: Int = $(minTokenLength) + final val stripHtml = + new BooleanParam(this, "stripHtml", "enable html stripping") + def setStripHtml(value: Boolean): this.type = 
set(stripHtml, value) + def getStripHtml: Boolean = $(stripHtml) + setDefault( minTokenLength -> TextTokenizer.MinTokenLength, toLowercase -> TextTokenizer.ToLowercase, autoDetectLanguage -> TextTokenizer.AutoDetectLanguage, autoDetectThreshold -> TextTokenizer.AutoDetectThreshold, - defaultLanguage -> TextTokenizer.DefaultLanguage.entryName + defaultLanguage -> TextTokenizer.DefaultLanguage.entryName, + stripHtml -> TextTokenizer.StripHtml ) def tokenize( text: Text, languageDetector: LanguageDetector = TextTokenizer.LanguageDetector, - analyzer: TextAnalyzer = TextTokenizer.Analyzer + analyzer: TextAnalyzer = if (getStripHtml) TextTokenizer.AnalyzerHtmlStrip else TextTokenizer.Analyzer ): TextTokenizerResult = TextTokenizer.tokenize( text = text, languageDetector = languageDetector, diff --git a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala index f50f8f1726..336a807dff 100644 --- a/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala +++ b/core/src/main/scala/com/salesforce/op/stages/impl/feature/Transmogrifier.scala @@ -53,7 +53,7 @@ private[op] trait TransmogrifierDefaults { val NullString: String = OpVectorColumnMetadata.NullString val OtherString: String = OpVectorColumnMetadata.OtherString val DefaultNumOfFeatures: Int = 512 - val MaxNumOfFeatures: Int = 16384 + val MaxNumOfFeatures: Int = 1 << 17 // 2^17 val DateListDefault: DateListPivot = DateListPivot.SinceLast val ReferenceDate: org.joda.time.DateTime = DateTimeUtils.now() val TopK: Int = 20 diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala index 190826675a..6dce0fa01c 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala +++ 
b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextMapVectorizerTest.scala @@ -772,6 +772,16 @@ class SmartTextMapVectorizerTest checkDerivedQuantities(res, "f2", Seq(4, 5, 5, 5, 3).map(_.toLong)) } + it should "turn on stripHTML flag is equivalent to passing in a custom AnalyzerHtmlStrip" + + " inside SmartTextMapVectorizer" in { + val exampleHTML = "Big ones, small

ones

, some as big as your head".toText + val tokensWithFlag = new SmartTextMapVectorizer() + .setStripHtml(true).setInput(m1).tokenize(exampleHTML).tokens.value + val tokensWithAnalyzer = new SmartTextMapVectorizer().setInput(m1) + .tokenize(exampleHTML, analyzer = TextTokenizer.AnalyzerHtmlStrip).tokens.value + tokensWithFlag should contain theSameElementsInOrderAs tokensWithAnalyzer + } + private[op] def assertVectorLength(df: DataFrame, output: FeatureLike[OPVector], expectedLength: Int, textVectorizationMethod: TextVectorizationMethod): Unit = { val result = df.collect(output) diff --git a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala index 3f12702c1b..06b43ac3d1 100644 --- a/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala +++ b/core/src/test/scala/com/salesforce/op/stages/impl/feature/SmartTextVectorizerTest.scala @@ -712,4 +712,14 @@ class SmartTextVectorizerTest ts.lengthStdDev.isNaN shouldBe true } + it should "turn on stripHTML flag is equivalent to passing in a custom AnalyzerHtmlStrip" + + " inside SmartTextVectorizer" in { + val exampleHTML = "Big ones, small

ones

, some as big as your head".toText + val tokensWithFlag = new SmartTextVectorizer() + .setStripHtml(true).setInput(f1).tokenize(exampleHTML).tokens.value + val tokensWithAnalyzer = new SmartTextVectorizer().setInput(f1) + .tokenize(exampleHTML, analyzer = TextTokenizer.AnalyzerHtmlStrip).tokens.value + tokensWithFlag should contain theSameElementsInOrderAs tokensWithAnalyzer + } + }