diff --git a/inception/inception-imls-stringmatch/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/stringmatch/span/StringMatchingRecommender.java b/inception/inception-imls-stringmatch/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/stringmatch/span/StringMatchingRecommender.java index 6ea8b5a60f0..0a773f7680a 100644 --- a/inception/inception-imls-stringmatch/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/stringmatch/span/StringMatchingRecommender.java +++ b/inception/inception-imls-stringmatch/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/stringmatch/span/StringMatchingRecommender.java @@ -40,6 +40,8 @@ import java.util.Objects; import java.util.Optional; import java.util.Set; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; import java.util.stream.IntStream; import java.util.stream.Stream; @@ -65,6 +67,7 @@ import de.tudarmstadt.ukp.inception.recommendation.api.recommender.RecommenderContext.Key; import de.tudarmstadt.ukp.inception.recommendation.imls.stringmatch.span.gazeteer.GazeteerService; import de.tudarmstadt.ukp.inception.recommendation.imls.stringmatch.span.gazeteer.model.GazeteerEntry; +import de.tudarmstadt.ukp.inception.recommendation.imls.stringmatch.span.trie.KeySanitizerFactory; import de.tudarmstadt.ukp.inception.recommendation.imls.stringmatch.span.trie.Trie; import de.tudarmstadt.ukp.inception.recommendation.imls.stringmatch.span.trie.WhitespaceNormalizingSanitizer; import de.tudarmstadt.ukp.inception.rendering.model.Range; @@ -87,6 +90,11 @@ public class StringMatchingRecommender private final GazeteerService gazeteerService; + private final KeySanitizerFactory keySanitizerFactory; + + private Pattern excludePattern; + private String excludePatternError; + public StringMatchingRecommender(Recommender aRecommender, StringMatchingRecommenderTraits aTraits) { @@ -100,6 +108,19 @@ public StringMatchingRecommender(Recommender aRecommender, traits = aTraits; 
gazeteerService = aGazeteerService; + keySanitizerFactory = WhitespaceNormalizingSanitizer.factory(); + + if (traits != null && traits.getExcludePattern() != null) { + try { + excludePattern = Pattern.compile(traits.getExcludePattern()); + } + catch (PatternSyntaxException e) { + excludePatternError = e.getMessage(); + } + } + else { + excludePattern = null; + } } @Override @@ -136,7 +157,7 @@ public void pretrain(List aData, RecommenderContext aContext) if (aData != null) { for (var entry : aData) { - learn(dict, entry.text, entry.label); + learn(dict, entry.text, entry.label, true); } aContext.log(LogMessage.info(getRecommender().getName(), @@ -148,12 +169,17 @@ public void pretrain(List aData, RecommenderContext aContext) private Trie createTrie() { - return new Trie<>(WhitespaceNormalizingSanitizer.factory()); + return new Trie<>(keySanitizerFactory); } @Override public void train(RecommenderContext aContext, List aCasses) throws RecommendationException { + if (excludePatternError != null) { + aContext.log(LogMessage.error(getRecommender().getName(), + "Ignoring bad exclude pattern: %s", excludePatternError)); + } + // Pre-load the gazeteers into the model if (gazeteerService != null) { for (var gaz : gazeteerService.listGazeteers(recommender)) { @@ -185,13 +211,13 @@ public void train(RecommenderContext aContext, List aCasses) throws Recomme var labels = FSUtil.getFeature(ann, predictedFeature, String[].class); if (labels != null) { for (var label : labels) { - learn(dict, ann.getCoveredText(), label); + learn(dict, ann.getCoveredText(), label, false); } } } else { - learn(dict, ann.getCoveredText(), - ann.getFeatureValueAsString(predictedFeature)); + learn(dict, ann.getCoveredText(), ann.getFeatureValueAsString(predictedFeature), + false); } } } @@ -356,7 +382,7 @@ public EvaluationResult evaluate(List aCasses, DataSplitter aDataSplitter) Trie dict = createTrie(); for (var sample : trainingSet) { for (var span : sample.getSpans()) { - learn(dict, span.text(), 
span.label()); + learn(dict, span.text(), span.label(), false); } } @@ -391,12 +417,22 @@ public EvaluationResult evaluate(List aCasses, DataSplitter aDataSplitter) SAMPLE_UNIT.getSimpleName(), trainingSetSize, testSetSize, trainRatio, NO_LABEL)); } - private void learn(Trie aDict, String aText, String aLabel) + private void learn(Trie aDict, String aText, String aLabel, boolean aBypassLimits) { if (isBlank(aText)) { return; } + if (!aBypassLimits && traits != null) { + if (excludePattern != null && excludePattern.matcher(aText).matches()) { + return; + } + + if (keySanitizerFactory.create().sanitize(aText).length() < traits.getMinLength()) { + return; + } + } + var label = isBlank(aLabel) ? BLANK_LABEL : aLabel; var text = aText; diff --git a/inception/inception-imls-stringmatch/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/stringmatch/span/StringMatchingRecommenderTraits.java b/inception/inception-imls-stringmatch/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/stringmatch/span/StringMatchingRecommenderTraits.java index b5ddca27da7..8694915c8f6 100644 --- a/inception/inception-imls-stringmatch/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/stringmatch/span/StringMatchingRecommenderTraits.java +++ b/inception/inception-imls-stringmatch/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/stringmatch/span/StringMatchingRecommenderTraits.java @@ -29,10 +29,16 @@ public class StringMatchingRecommenderTraits implements Serializable { - private static final long serialVersionUID = -7433406243352691789L; + private static final long serialVersionUID = -7329491581513178640L; private boolean ignoreCase; + private String excludePattern; + + private int minLength = 3; + + // private int maxLength = 255; + public boolean isIgnoreCase() { return ignoreCase; @@ -42,4 +48,35 @@ public void setIgnoreCase(boolean aIgnoreCase) { ignoreCase = aIgnoreCase; } + + public int getMinLength() + { + return minLength; + } + + public 
void setMinLength(int aMinLength) + { + minLength = aMinLength; + } + + public String getExcludePattern() + { + return excludePattern; + } + + public void setExcludePattern(String aExcludePattern) + { + excludePattern = aExcludePattern; + } + + // public int getMaxLength() + // { + // return maxLength; + // } + // + // public void setMaxLength(int aMaxLength) + // { + // maxLength = aMaxLength; + // } + } diff --git a/inception/inception-imls-stringmatch/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/stringmatch/span/settings/StringMatchingRecommenderTraitsEditor.html b/inception/inception-imls-stringmatch/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/stringmatch/span/settings/StringMatchingRecommenderTraitsEditor.html index e943b960989..f689fb93730 100644 --- a/inception/inception-imls-stringmatch/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/stringmatch/span/settings/StringMatchingRecommenderTraitsEditor.html +++ b/inception/inception-imls-stringmatch/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/stringmatch/span/settings/StringMatchingRecommenderTraitsEditor.html @@ -21,7 +21,7 @@
-
+
+
+ +
+
+ + (.*) +
+
+ Regular expression for entries to exclude: text that fully matches this pattern is not added to the dictionary.
+
+
+
+ +
+ +
+ Minimum length for dictionary entries. Shorter entries will not be added. +
+
+