From 954a6ac73888e1d157466acfd0f972ee9264ca67 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 27 Dec 2023 16:50:57 +0100 Subject: [PATCH 1/5] #4399 - Allow OpenNLP Multi-Token Sequence Classifier to work for cross-sentence layers - Towards supporting a sliding-window approach --- .../opennlp/ner/OpenNlpNerRecommender.java | 103 +++++++++++++++--- .../ner/OpenNlpNerRecommenderFactory.java | 7 +- .../ner/OpenNlpNerRecommenderTest.java | 71 ++++++------ 3 files changed, 122 insertions(+), 59 deletions(-) diff --git a/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java b/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java index 8b20115d904..8348f255a00 100644 --- a/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java +++ b/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java @@ -27,7 +27,6 @@ import java.io.IOException; import java.util.ArrayList; -import java.util.Collection; import java.util.List; import java.util.Objects; @@ -219,7 +218,7 @@ public EvaluationResult evaluate(List aCasses, DataSplitter aDataSplitter) return result; } - LOG.info("Training on [{}] sentences, predicting on [{}] of total [{}]", trainingSet.size(), + LOG.info("Training on [{}] samples, predicting on [{}] of total [{}]", trainingSet.size(), testSet.size(), data.size()); // Train model @@ -236,11 +235,11 @@ public EvaluationResult evaluate(List aCasses, DataSplitter aDataSplitter) nameFinder.clearAdaptiveData(); // Span contains one NE, Array of them all in one sentence - var sentence = sample.getSentence(); - var predictedNames = nameFinder.find(sentence); + var sampleTokens = sample.getSentence(); + var predictedNames = nameFinder.find(sampleTokens); var goldNames = sample.getNames(); - labelPairs.addAll(determineLabelsForASentence(sentence, predictedNames, goldNames)); + labelPairs.addAll(determineLabelsForSample(sampleTokens, predictedNames, goldNames)); } return labelPairs.stream().collect(toEvaluationResult(DATAPOINT_UNIT.getSimpleName(), @@ -251,7 +250,7 @@ public EvaluationResult evaluate(List aCasses, DataSplitter aDataSplitter) * Extract AnnotatedTokenPairs with info on predicted and gold label for each token of the given * sentence. */ - private List determineLabelsForASentence(String[] sentence, Span[] predictedNames, + private List determineLabelsForSample(String[] sentence, Span[] predictedNames, Span[] goldNames) { int predictedNameIdx = 0; @@ -266,7 +265,7 @@ private List determineLabelsForASentence(String[] sentence, Span[] pr var predictedLabel = NO_NE_TAG; if (predictedNameIdx < predictedNames.length) { - Span predictedName = predictedNames[predictedNameIdx]; + var predictedName = predictedNames[predictedNameIdx]; predictedLabel = determineLabel(predictedName, i); if (i > predictedName.getEnd()) { @@ -305,10 +304,15 @@ private String determineLabel(Span aName, int aTokenIdx) } private List extractNameSamples(Iterable aCasses) + { + return extractNameSamplesSlidingWindow(aCasses); + } + + private List extractNameSamplesSentences(Iterable aCasses) { var nameSamples = new ArrayList(); - nextCas: for (CAS cas : aCasses) { + nextCas: for (var cas : aCasses) { var sampleUnitType = getType(cas, SAMPLE_UNIT); var tokenType = getType(cas, Token.class); @@ -325,7 +329,37 @@ private List extractNameSamples(Iterable aCasses) var tokens = cas. select(tokenType).coveredBy(sampleUnit).asList(); var tokenTexts = tokens.stream().map(AnnotationFS::getCoveredText) .toArray(String[]::new); - var annotatedSpans = extractAnnotatedSpans(cas, sampleUnit, tokens); + var annotatedSpans = extractAnnotatedSpans(cas, tokens); + if (annotatedSpans.length == 0) { + continue; + } + + var nameSample = new NameSample(tokenTexts, annotatedSpans, firstSampleInCas); + nameSamples.add(nameSample); + firstSampleInCas = false; + } + } + + return nameSamples; + } + + private List extractNameSamplesSlidingWindow(Iterable aCasses) + { + var nameSamples = new ArrayList(); + + var maxSize = 500; + nextCas: for (var cas : aCasses) { + var firstSampleInCas = true; + var tokens = makeSample(cas, 0, maxSize); + + while (!tokens.isEmpty()) { + if (nameSamples.size() >= traits.getTrainingSetSizeLimit()) { + break nextCas; + } + + var tokenTexts = tokens.stream().map(AnnotationFS::getCoveredText) + .toArray(String[]::new); + var annotatedSpans = extractAnnotatedSpans(cas, tokens); if (annotatedSpans.length == 0) { continue; } @@ -333,20 +367,53 @@ private List extractNameSamples(Iterable aCasses) var nameSample = new NameSample(tokenTexts, annotatedSpans, firstSampleInCas); nameSamples.add(nameSample); firstSampleInCas = false; + + var firstTokenBegin = tokens.get(0).getBegin(); + var lastTokenEnd = tokens.get(tokens.size() - 1).getEnd(); + tokens = makeSample(cas, (lastTokenEnd - firstTokenBegin) / 2, maxSize); } } return nameSamples; } - private Span[] extractAnnotatedSpans(CAS aCas, AnnotationFS aSampleUnit, - Collection aTokens) + private List makeSample(CAS aCas, int aBegin, int aMaxLength) + { + var result = new ArrayList(); + var size = 0; + var i = aCas.select(Token.class).startAt(aBegin).iterator(); + + while (i.hasNext()) { + var token = i.next(); + var tokenText = token.getCoveredText(); + + if (isBlank(tokenText)) { + continue; + } + + size += tokenText.length(); + if (size >= aMaxLength && !result.isEmpty()) { + // Maximum unit size reached + break; + } + result.add(token); + } + + return result; + } + + private Span[] extractAnnotatedSpans(CAS aCas, List aTokens) { - // Create spans from target annotations + if (aTokens.isEmpty()) { + return new Span[0]; + } + + // Collect relevant annotations var annotationType = getType(aCas, layerName); var feature = annotationType.getFeatureByBaseName(featureName); - var annotations = selectCovered(annotationType, aSampleUnit); - + var annotations = aCas. select(annotationType) + .coveredBy(aTokens.get(0).getBegin(), aTokens.get(aTokens.size() - 1).getEnd()) + .asList(); if (annotations.isEmpty()) { return new Span[0]; } @@ -356,10 +423,10 @@ private Span[] extractAnnotatedSpans(CAS aCas, AnnotationFS aSampleUnit, var idxTokenEndOffset = new Int2ObjectOpenHashMap(); var idxToken = new Object2IntOpenHashMap(); var idx = 0; - for (AnnotationFS t : aTokens) { - idxTokenBeginOffset.put(t.getBegin(), t); - idxTokenEndOffset.put(t.getEnd(), t); - idxToken.put(t, idx); + for (var token : aTokens) { + idxTokenBeginOffset.put(token.getBegin(), token); + idxTokenEndOffset.put(token.getEnd(), token); + idxToken.put(token, idx); idx++; } diff --git a/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderFactory.java b/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderFactory.java index 8a2f1578dd8..97b4307e828 100644 --- a/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderFactory.java +++ b/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderFactory.java @@ -21,15 +21,16 @@ */ package de.tudarmstadt.ukp.inception.recommendation.imls.opennlp.ner; +import static de.tudarmstadt.ukp.clarin.webanno.model.AnchoringMode.SENTENCES; import static de.tudarmstadt.ukp.clarin.webanno.model.AnchoringMode.SINGLE_TOKEN; import static de.tudarmstadt.ukp.clarin.webanno.model.AnchoringMode.TOKENS; -import static de.tudarmstadt.ukp.inception.support.WebAnnoConst.SPAN_TYPE; import static java.util.Arrays.asList; import org.apache.uima.cas.CAS; import de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature; import de.tudarmstadt.ukp.clarin.webanno.model.AnnotationLayer; +import de.tudarmstadt.ukp.inception.annotation.layer.span.SpanLayerSupport; import de.tudarmstadt.ukp.inception.recommendation.api.model.Recommender; import de.tudarmstadt.ukp.inception.recommendation.api.recommender.RecommendationEngine; import de.tudarmstadt.ukp.inception.recommendation.api.recommender.RecommendationEngineFactoryImplBase; @@ -67,8 +68,8 @@ public boolean accepts(AnnotationLayer aLayer, AnnotationFeature aFeature) return false; } - return (asList(SINGLE_TOKEN, TOKENS).contains(aLayer.getAnchoringMode())) - && !aLayer.isCrossSentence() && SPAN_TYPE.equals(aLayer.getType()) + return (asList(SINGLE_TOKEN, TOKENS, SENTENCES).contains(aLayer.getAnchoringMode())) + && SpanLayerSupport.TYPE.equals(aLayer.getType()) && (CAS.TYPE_NAME_STRING.equals(aFeature.getType()) || aFeature.isVirtualFeature()); } diff --git a/inception/inception-imls-opennlp/src/test/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderTest.java b/inception/inception-imls-opennlp/src/test/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderTest.java index 13f8e279a54..3199cea60fc 100644 --- a/inception/inception-imls-opennlp/src/test/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderTest.java +++ b/inception/inception-imls-opennlp/src/test/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderTest.java @@ -28,15 +28,11 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; -import java.util.Collection; import java.util.List; import org.apache.uima.UIMAException; import org.apache.uima.cas.CAS; -import org.apache.uima.collection.CollectionReader; import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.fit.util.JCasUtil; -import org.apache.uima.jcas.JCas; import org.dkpro.core.api.datasets.Dataset; import org.dkpro.core.api.datasets.DatasetFactory; import org.dkpro.core.io.conll.Conll2002Reader; @@ -48,8 +44,6 @@ import de.tudarmstadt.ukp.clarin.webanno.model.AnnotationLayer; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; import de.tudarmstadt.ukp.inception.annotation.storage.CasStorageSession; -import de.tudarmstadt.ukp.inception.recommendation.api.evaluation.DataSplitter; -import de.tudarmstadt.ukp.inception.recommendation.api.evaluation.EvaluationResult; import de.tudarmstadt.ukp.inception.recommendation.api.evaluation.IncrementalSplitter; import de.tudarmstadt.ukp.inception.recommendation.api.evaluation.PercentageBasedSplitter; import de.tudarmstadt.ukp.inception.recommendation.api.model.Recommender; @@ -81,20 +75,21 @@ public void setUp() @Test public void thatTrainingWorks() throws Exception { - OpenNlpNerRecommender sut = new OpenNlpNerRecommender(recommender, traits); - List casList = loadDevelopmentData(); + var sut = new OpenNlpNerRecommender(recommender, traits); + var casList = loadDevelopmentData(); sut.train(context, casList); - assertThat(context.get(OpenNlpNerRecommender.KEY_MODEL)).as("Model has been set") + assertThat(context.get(OpenNlpNerRecommender.KEY_MODEL)) // + .as("Model has been set") // .isPresent(); } @Test public void thatPredictionWorks() throws Exception { - OpenNlpNerRecommender sut = new OpenNlpNerRecommender(recommender, traits); - List casList = loadDevelopmentData(); + var sut = new OpenNlpNerRecommender(recommender, traits); + var casList = loadDevelopmentData(); CAS cas = casList.get(0); try (CasStorageSession session = CasStorageSession.open()) { @@ -106,7 +101,7 @@ public void thatPredictionWorks() throws Exception sut.predict(new PredictionContext(context), cas); - Collection predictions = JCasUtil.select(cas.getJCas(), NamedEntity.class); + var predictions = cas.select(NamedEntity.class).asList(); assertThat(predictions).as("Predictions have been written to CAS").isNotEmpty(); } @@ -114,44 +109,44 @@ public void thatPredictionWorks() throws Exception @Test public void thatEvaluationWorks() throws Exception { - DataSplitter splitStrategy = new PercentageBasedSplitter(0.8, 10); - OpenNlpNerRecommender sut = new OpenNlpNerRecommender(recommender, traits); - List casList = loadDevelopmentData(); + var splitStrategy = new PercentageBasedSplitter(0.8, 10); + var sut = new OpenNlpNerRecommender(recommender, traits); + var casList = loadDevelopmentData(); - EvaluationResult result = sut.evaluate(casList, splitStrategy); + var result = sut.evaluate(casList, splitStrategy); - double fscore = result.computeF1Score(); - double accuracy = result.computeAccuracyScore(); - double precision = result.computePrecisionScore(); - double recall = result.computeRecallScore(); + var fscore = result.computeF1Score(); + var accuracy = result.computeAccuracyScore(); + var precision = result.computePrecisionScore(); + var recall = result.computeRecallScore(); System.out.printf("F1-Score: %f%n", fscore); System.out.printf("Accuracy: %f%n", accuracy); System.out.printf("Precision: %f%n", precision); System.out.printf("Recall: %f%n", recall); - assertThat(fscore).isStrictlyBetween(0.0, 1.0); - assertThat(precision).isStrictlyBetween(0.0, 1.0); - assertThat(recall).isStrictlyBetween(0.0, 1.0); - assertThat(accuracy).isStrictlyBetween(0.0, 1.0); + assertThat(fscore).isBetween(0.0, 1.0); + assertThat(precision).isBetween(0.0, 1.0); + assertThat(recall).isBetween(0.0, 1.0); + assertThat(accuracy).isBetween(0.0, 1.0); } @Test public void thatIncrementalNerEvaluationWorks() throws Exception { - IncrementalSplitter splitStrategy = new IncrementalSplitter(0.8, 250, 10); - OpenNlpNerRecommender sut = new OpenNlpNerRecommender(recommender, traits); - List casList = loadAllData(); + var splitStrategy = new IncrementalSplitter(0.8, 250, 10); + var sut = new OpenNlpNerRecommender(recommender, traits); + var casList = loadAllData(); - int i = 0; + var i = 0; while (splitStrategy.hasNext() && i < 3) { splitStrategy.next(); - double score = sut.evaluate(casList, splitStrategy).computeF1Score(); + var score = sut.evaluate(casList, splitStrategy).computeF1Score(); System.out.printf("Score: %f%n", score); - assertThat(score).isStrictlyBetween(0.0, 1.0); + assertThat(score).isBetween(0.0, 1.0); i++; } @@ -160,7 +155,7 @@ public void thatIncrementalNerEvaluationWorks() throws Exception private List loadAllData() throws IOException, UIMAException { try { - Dataset ds = loader.load("germeval2014-de", CONTINUE); + var ds = loader.load("germeval2014-de", CONTINUE); return loadData(ds, ds.getDataFiles()); } catch (Exception e) { @@ -173,7 +168,7 @@ private List loadAllData() throws IOException, UIMAException private List loadDevelopmentData() throws IOException, UIMAException { try { - Dataset ds = loader.load("germeval2014-de", CONTINUE); + var ds = loader.load("germeval2014-de", CONTINUE); return loadData(ds, ds.getDefaultSplit().getDevelopmentFiles()); } catch (Exception e) { @@ -185,7 +180,7 @@ private List loadDevelopmentData() throws IOException, UIMAException private List loadData(Dataset ds, File... files) throws UIMAException, IOException { - CollectionReader reader = createReader( // + var reader = createReader( // Conll2002Reader.class, // Conll2002Reader.PARAM_PATTERNS, files, // Conll2002Reader.PARAM_LANGUAGE, ds.getLanguage(), // @@ -194,9 +189,9 @@ private List loadData(Dataset ds, File... files) throws UIMAException, IOEx Conll2002Reader.PARAM_HAS_HEADER, true, // Conll2002Reader.PARAM_HAS_EMBEDDED_NAMED_ENTITY, true); - List casList = new ArrayList<>(); + var casList = new ArrayList(); while (reader.hasNext()) { - JCas cas = JCasFactory.createJCas(); + var cas = JCasFactory.createJCas(); reader.getNext(cas.getCas()); casList.add(cas.getCas()); } @@ -205,13 +200,13 @@ private List loadData(Dataset ds, File... files) throws UIMAException, IOEx private static Recommender buildRecommender() { - AnnotationLayer layer = new AnnotationLayer(); + var layer = new AnnotationLayer(); layer.setName(NamedEntity.class.getName()); - AnnotationFeature feature = new AnnotationFeature(); + var feature = new AnnotationFeature(); feature.setName("value"); - Recommender recommender = new Recommender(); + var recommender = new Recommender(); recommender.setLayer(layer); recommender.setFeature(feature); From b1ffa850b42c31040025f73ad4ded094e0eda5f2 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 27 Dec 2023 23:05:52 +0100 Subject: [PATCH 2/5] #4399 - Allow OpenNLP Multi-Token Sequence Classifier to work for cross-sentence layers - Reduce window size to 100 chars for the moment - Allow predicting spans with no label - Fix endless loop bug in sliding window sample generation --- .../opennlp/ner/OpenNlpNerRecommender.java | 97 +++++++++++++------ 1 file changed, 68 insertions(+), 29 deletions(-) diff --git a/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java b/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java index 8348f255a00..4330206063c 100644 --- a/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java +++ b/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java @@ -21,12 +21,14 @@ import static de.tudarmstadt.ukp.inception.rendering.model.Range.rangeCoveringAnnotations; import static de.tudarmstadt.ukp.inception.support.uima.WebAnnoCasUtil.selectOverlapping; import static org.apache.commons.lang3.StringUtils.isBlank; -import static org.apache.commons.lang3.StringUtils.isNotBlank; import static org.apache.uima.fit.util.CasUtil.getType; import static org.apache.uima.fit.util.CasUtil.selectCovered; import java.io.IOException; import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import java.util.Objects; @@ -66,6 +68,7 @@ public class OpenNlpNerRecommender private static final Logger LOG = LoggerFactory.getLogger(OpenNlpNerRecommender.class); private static final String NO_NE_TAG = "O"; + private static final String BLANK_LABEL = "__BLANK_LABEL__"; private static final Class SAMPLE_UNIT = Sentence.class; private static final Class DATAPOINT_UNIT = Token.class; @@ -150,12 +153,14 @@ public Range predict(PredictionContext aContext, CAS aCas, int aBegin, int aEnd) for (var prediction : finder.find(tokens)) { var label = prediction.getType(); - if (NameSample.DEFAULT_TYPE.equals(label)) { - continue; + if (NameSample.DEFAULT_TYPE.equals(label) || BLANK_LABEL.equals(label)) { + label = null; } + int begin = tokenAnnotations.get(prediction.getStart()).getBegin(); int end = tokenAnnotations.get(prediction.getEnd() - 1).getEnd(); var annotation = aCas.createAnnotation(predictedType, begin, end); + annotation.setStringValue(predictedFeature, label); if (scoreFeature != null) { annotation.setDoubleValue(scoreFeature, prediction.getProb()); @@ -347,44 +352,69 @@ private List extractNameSamplesSlidingWindow(Iterable aCasses) { var nameSamples = new ArrayList(); - var maxSize = 500; + var maxSize = 100; nextCas: for (var cas : aCasses) { var firstSampleInCas = true; - var tokens = makeSample(cas, 0, maxSize); + var tokenIterator = cas.select(Token.class).iterator(); + var tokens = makeSample(tokenIterator, new LinkedList(), maxSize, maxSize / 2); while (!tokens.isEmpty()) { if (nameSamples.size() >= traits.getTrainingSetSizeLimit()) { + // Generated maximum number of samples break nextCas; } - var tokenTexts = tokens.stream().map(AnnotationFS::getCoveredText) + var tokenTexts = tokens.stream() // + .map(AnnotationFS::getCoveredText) // .toArray(String[]::new); var annotatedSpans = extractAnnotatedSpans(cas, tokens); - if (annotatedSpans.length == 0) { - continue; + if (annotatedSpans.length > 0) { + var nameSample = new NameSample(tokenTexts, annotatedSpans, firstSampleInCas); + nameSamples.add(nameSample); + firstSampleInCas = false; } - var nameSample = new NameSample(tokenTexts, annotatedSpans, firstSampleInCas); - nameSamples.add(nameSample); - firstSampleInCas = false; - - var firstTokenBegin = tokens.get(0).getBegin(); - var lastTokenEnd = tokens.get(tokens.size() - 1).getEnd(); - tokens = makeSample(cas, (lastTokenEnd - firstTokenBegin) / 2, maxSize); + tokens = makeSample(tokenIterator, tokens, maxSize, maxSize / 2); } } return nameSamples; } - private List makeSample(CAS aCas, int aBegin, int aMaxLength) + private List makeSample(Iterator aFreshTokenIterator, List aTokens, + int aMaxLength, int aOverlap) { - var result = new ArrayList(); + if (!aFreshTokenIterator.hasNext()) { + return Collections.emptyList(); + } + + var result = new LinkedList(); + + // Add tokens overlapping with previous sample var size = 0; - var i = aCas.select(Token.class).startAt(aBegin).iterator(); + if (aOverlap > 0) { + var overlapIterator = result.descendingIterator(); + while (overlapIterator.hasNext()) { + var token = overlapIterator.next(); + var tokenText = token.getCoveredText(); - while (i.hasNext()) { - var token = i.next(); + if (isBlank(tokenText)) { + continue; + } + + size += tokenText.length(); + if (size >= aOverlap && !result.isEmpty()) { + // Overlap size reached + break; + } + result.add(0, token); + } + } + + // Add fresh tokens + var freshTokenAdded = false; + while (aFreshTokenIterator.hasNext()) { + var token = aFreshTokenIterator.next(); var tokenText = token.getCoveredText(); if (isBlank(tokenText)) { @@ -392,11 +422,17 @@ private List makeSample(CAS aCas, int aBegin, int aMaxLength) } size += tokenText.length(); - if (size >= aMaxLength && !result.isEmpty()) { - // Maximum unit size reached + if (size >= aMaxLength && freshTokenAdded) { + // Maximum sample size reached break; } + result.add(token); + freshTokenAdded = true; + } + + if (!freshTokenAdded) { + return Collections.emptyList(); } return result; @@ -411,8 +447,10 @@ private Span[] extractAnnotatedSpans(CAS aCas, List aTok // Collect relevant annotations var annotationType = getType(aCas, layerName); var feature = annotationType.getFeatureByBaseName(featureName); - var annotations = aCas. select(annotationType) - .coveredBy(aTokens.get(0).getBegin(), aTokens.get(aTokens.size() - 1).getEnd()) + var windowBegin = aTokens.get(0).getBegin(); + var windowEnd = aTokens.get(aTokens.size() - 1).getEnd(); + var annotations = aCas. select(annotationType) // + .coveredBy(windowBegin, windowEnd) // .asList(); if (annotations.isEmpty()) { return new Span[0]; @@ -431,11 +469,14 @@ private Span[] extractAnnotatedSpans(CAS aCas, List aTok } var result = new ArrayList(); - var highestEndTokenPositionObserved = 0; + var highestEndTokenPositionObserved = -1; var numberOfAnnotations = annotations.size(); for (int i = 0; i < numberOfAnnotations; i++) { var annotation = annotations.get(i); var label = annotation.getFeatureValueAsString(feature); + if (isBlank(label)) { + label = BLANK_LABEL; + } var beginToken = idxTokenBeginOffset.get(annotation.getBegin()); var endToken = idxTokenEndOffset.get(annotation.getEnd()); @@ -456,10 +497,8 @@ private Span[] extractAnnotatedSpans(CAS aCas, List aTok continue; } - if (isNotBlank(label)) { - result.add(new Span(begin, end + 1, label)); - highestEndTokenPositionObserved = end + 1; - } + result.add(new Span(begin, end + 1, label)); + highestEndTokenPositionObserved = end + 1; } return result.toArray(new Span[result.size()]); From f0f4f5d3a5e87c388cae612e2c04f589bce282f9 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Wed, 27 Dec 2023 23:06:21 +0100 Subject: [PATCH 3/5] #4399 - Allow OpenNLP Multi-Token Sequence Classifier to work for cross-sentence layers - Use var in some places --- .../ukp/inception/support/uima/WebAnnoCasUtil.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/WebAnnoCasUtil.java b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/WebAnnoCasUtil.java index e4f540396aa..4c940139475 100644 --- a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/WebAnnoCasUtil.java +++ b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/WebAnnoCasUtil.java @@ -271,9 +271,9 @@ public static List selectOverlapping(CAS aCas, Type aType, int aBe @SuppressWarnings("unchecked") public static T getNext(T aRef) { - CAS cas = aRef.getCAS(); - AnnotationIndex idx = cas.getAnnotationIndex(aRef.getType()); - FSIterator it = idx.iterator(aRef); + var cas = aRef.getCAS(); + var idx = cas.getAnnotationIndex(aRef.getType()); + var it = idx.iterator(aRef); if (!it.isValid()) { return null; @@ -287,7 +287,7 @@ public static T getNext(T aRef) } // Seek left until we hit the last FS that is no longer equal to the current - boolean moved = false; + var moved = false; while (it.isValid() && idx.compare(it.get(), aRef) == 0) { it.moveToPrevious(); moved = true; From 5fbd3d49ab8dd5112997b149824911dd85ff6a0e Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Sat, 30 Dec 2023 18:41:14 +0100 Subject: [PATCH 4/5] #4399 - Allow OpenNLP Multi-Token Sequence Classifier to work for cross-sentence layers - Adjust window size depending on document size - Improve API of AnnotationBuilder --- .../webanno/constraints/ComplexTypeTest.java | 8 +- .../ollama/response/MentionsSampleTest.java | 2 +- .../opennlp/ner/OpenNlpNerRecommender.java | 77 ++++++++++++--- .../ner/OpenNlpNerRecommenderTraits.java | 17 +++- .../ner/OpenNlpNerRecommenderTest.java | 58 ++++++++++-- .../span/SpanSuggestionExtractionTest.java | 2 +- .../support/uima/AnnotationBuilder.java | 94 ++++++++++++++++++- .../support/uima/SegmentationUtils.java | 6 ++ 8 files changed, 229 insertions(+), 35 deletions(-) diff --git a/inception/inception-constraints/src/test/java/de/tudarmstadt/ukp/clarin/webanno/constraints/ComplexTypeTest.java b/inception/inception-constraints/src/test/java/de/tudarmstadt/ukp/clarin/webanno/constraints/ComplexTypeTest.java index fc7ffd42e34..671e7b313da 100644 --- a/inception/inception-constraints/src/test/java/de/tudarmstadt/ukp/clarin/webanno/constraints/ComplexTypeTest.java +++ b/inception/inception-constraints/src/test/java/de/tudarmstadt/ukp/clarin/webanno/constraints/ComplexTypeTest.java @@ -68,14 +68,14 @@ public void thatSlotFeatureInConditionWorks() throws Exception .withFeature("links", asList( buildFS(cas, "webanno.custom.ComplexLinkType") .withFeature("target", buildAnnotation(cas, "webanno.custom.Span") - .on("ACME") - .withFeature("value", "PER") + .on("ACME") // + .withFeature("value", "PER") // .buildAndAddToIndexes()) .buildWithoutAddingToIndexes(), buildFS(cas, "webanno.custom.ComplexLinkType") .withFeature("target", buildAnnotation(cas, "webanno.custom.Span") - .on("Foobar") - .withFeature("value", "LOC") + .on("Foobar") // + .withFeature("value", "LOC") // .buildAndAddToIndexes()) .buildWithoutAddingToIndexes())) .buildAndAddToIndexes(); diff --git a/inception/inception-imls-ollama/src/test/java/de/tudarmstadt/ukp/inception/recommendation/imls/ollama/response/MentionsSampleTest.java b/inception/inception-imls-ollama/src/test/java/de/tudarmstadt/ukp/inception/recommendation/imls/ollama/response/MentionsSampleTest.java index df38ec71981..b1235441ee2 100644 --- a/inception/inception-imls-ollama/src/test/java/de/tudarmstadt/ukp/inception/recommendation/imls/ollama/response/MentionsSampleTest.java +++ b/inception/inception-imls-ollama/src/test/java/de/tudarmstadt/ukp/inception/recommendation/imls/ollama/response/MentionsSampleTest.java @@ -78,7 +78,7 @@ void testGenerateExamples() { String text = "John likes Mary."; cas.setDocumentText(text); - buildAnnotation(cas, Sentence.class).on(text) // + buildAnnotation(cas, Sentence.class).onMatch(text) // .buildAndAddToIndexes(); buildAnnotation(cas, NamedEntity.class).on("John") // .withFeature(NamedEntity._FeatName_value, "PER") // diff --git a/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java b/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java index 3f261de5f55..36c54f34be1 100644 --- a/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java +++ b/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java @@ -66,14 +66,17 @@ public class OpenNlpNerRecommender extends RecommendationEngine { public static final Key KEY_MODEL = new Key<>("opennlp_ner_model"); + private static final Logger LOG = LoggerFactory.getLogger(OpenNlpNerRecommender.class); private static final String NO_NE_TAG = "O"; - private static final String BLANK_LABEL = "__BLANK_LABEL__"; private static final Class SAMPLE_UNIT = Sentence.class; private static final Class DATAPOINT_UNIT = Token.class; + private static final int DEFAULT_WINDOW_SIZE = 300; + private static final int MIN_WINDOW_SIZE = 30; + private static final int MIN_TRAINING_SET_SIZE = 2; private static final int MIN_TEST_SET_SIZE = 2; @@ -95,7 +98,7 @@ public boolean isReadyForPrediction(RecommenderContext aContext) @Override public void train(RecommenderContext aContext, List aCasses) throws RecommendationException { - var nameSamples = extractNameSamples(aCasses); + var nameSamples = extractSamples(aCasses); if (nameSamples.size() < 2) { aContext.log(LogMessage.warn(getRecommender().getName(), @@ -143,9 +146,11 @@ public Range predict(PredictionContext aContext, CAS aCas, int aBegin, int aEnd) var predictionCount = 0; for (var unit : units) { - if (predictionCount >= traits.getPredictionLimit()) { + int predictionsLimit = traits.getPredictionLimit(); + if (predictionsLimit > 0 && predictionCount >= predictionsLimit) { break; } + predictionCount++; var tokenAnnotations = selectCovered(tokenType, unit); @@ -181,14 +186,17 @@ public Range predict(PredictionContext aContext, CAS aCas, int aBegin, int aEnd) @Override public int estimateSampleCount(List aCasses) { - return extractNameSamples(aCasses).size(); + return extractSamples(aCasses).size(); } @Override public EvaluationResult evaluate(List aCasses, DataSplitter aDataSplitter) throws RecommendationException { - var data = extractNameSamples(aCasses); + // We use sentence-based samples here even if the layer allows cross-sentence annotations + // because with the overlapping sliding window, the evaluation would otherwise train on test + // data. + var data = extractSamplesFromSentences(aCasses); var trainingSet = new ArrayList(); var testSet = new ArrayList(); @@ -212,7 +220,7 @@ public EvaluationResult evaluate(List aCasses, DataSplitter aDataSplitter) var trainRatio = (overallTrainingSize > 0) ? trainingSetSize / overallTrainingSize : 0.0; if (trainingSetSize < MIN_TRAINING_SET_SIZE || testSetSize < MIN_TEST_SET_SIZE) { - String msg = String.format( + var msg = String.format( "Not enough evaluation data: training set size [%d] (min. %d), test set size [%d] (min. %d) of total [%d] (min. %d)", trainingSetSize, MIN_TRAINING_SET_SIZE, testSetSize, MIN_TEST_SET_SIZE, data.size(), (MIN_TRAINING_SET_SIZE + MIN_TEST_SET_SIZE)); @@ -310,12 +318,17 @@ private String determineLabel(Span aName, int aTokenIdx) return label; } - private List extractNameSamples(Iterable aCasses) + private List extractSamples(Iterable aCasses) { - return extractNameSamplesSlidingWindow(aCasses); + if (getRecommender().getLayer().isCrossSentence()) { + return extractSamplesUsingSlidingWindow(aCasses); + } + else { + return extractSamplesFromSentences(aCasses); + } } - private List extractNameSamplesSentences(Iterable aCasses) + private List extractSamplesFromSentences(Iterable aCasses) { var nameSamples = new ArrayList(); @@ -325,7 +338,8 @@ private List extractNameSamplesSentences(Iterable aCasses) var firstSampleInCas = true; for (var sampleUnit : cas. select(sampleUnitType)) { - if (nameSamples.size() >= traits.getTrainingSetSizeLimit()) { + int trainingSetSizeLimit = traits.getTrainingSetSizeLimit(); + if (trainingSetSizeLimit > 0 && nameSamples.size() >= trainingSetSizeLimit) { break nextCas; } @@ -350,18 +364,22 @@ private List extractNameSamplesSentences(Iterable aCasses) return nameSamples; } - private List extractNameSamplesSlidingWindow(Iterable aCasses) + private List extractSamplesUsingSlidingWindow(Iterable aCasses) { var nameSamples = new ArrayList(); - var maxSize = 100; nextCas: for (var cas : aCasses) { + var windowSize = getWindowSize(cas); + var windowOverlap = windowSize / 2; + var firstSampleInCas = true; var tokenIterator = cas.select(Token.class).iterator(); - var tokens = makeSample(tokenIterator, new LinkedList(), maxSize, maxSize / 2); + var tokens = makeSample(tokenIterator, new LinkedList(), windowSize, + windowOverlap); while (!tokens.isEmpty()) { - if (nameSamples.size() >= traits.getTrainingSetSizeLimit()) { + int trainingSetSizeLimit = traits.getTrainingSetSizeLimit(); + if (trainingSetSizeLimit > 0 && nameSamples.size() >= trainingSetSizeLimit) { // Generated maximum number of samples break nextCas; } @@ -376,13 +394,42 @@ private List extractNameSamplesSlidingWindow(Iterable aCasses) firstSampleInCas = false; } - tokens = makeSample(tokenIterator, tokens, maxSize, maxSize / 2); + tokens = makeSample(tokenIterator, tokens, windowSize, windowOverlap); } } return nameSamples; } + private int getWindowSize(CAS aCas) + { + int textLengh = aCas.getDocumentText().length(); + + int windowSize = traits.getWindowSize(); + if (windowSize <= 0) { + windowSize = DEFAULT_WINDOW_SIZE; + } + + // If the document is short try scaling down the window size to get a + // few more samples. + int minDesiredSamples = 10; + if (windowSize * minDesiredSamples > textLengh) { + windowSize = textLengh / minDesiredSamples; + } + + // If the document is too short to accommodate the minimum training set size + // with the current window size, scale the window size down. + if (windowSize * MIN_TRAINING_SET_SIZE > textLengh) { + windowSize = textLengh / MIN_TRAINING_SET_SIZE; + } + + if (windowSize < MIN_WINDOW_SIZE) { + windowSize = MIN_WINDOW_SIZE; + } + + return windowSize; + } + private List makeSample(Iterator aFreshTokenIterator, List aTokens, int aMaxLength, int aOverlap) { diff --git a/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderTraits.java b/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderTraits.java index 136dab4752f..7f185f84573 100644 --- a/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderTraits.java +++ b/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderTraits.java @@ -21,6 +21,8 @@ import com.fasterxml.jackson.annotation.JsonIgnore; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonInclude; +import com.fasterxml.jackson.annotation.JsonInclude.Include; import opennlp.tools.util.TrainingParameters; @@ -30,8 +32,9 @@ public class OpenNlpNerRecommenderTraits { private static final long serialVersionUID = 7717316701623340670L; - private int trainingSetSizeLimit = Integer.MAX_VALUE; - private int predictionLimit = Integer.MAX_VALUE; + private @JsonInclude(Include.NON_DEFAULT) int trainingSetSizeLimit = 0; + private @JsonInclude(Include.NON_DEFAULT) int predictionLimit = 0; + private @JsonInclude(Include.NON_DEFAULT) int windowSize = 0; private int numThreads = 1; @@ -65,6 +68,16 @@ public void setPredictionLimit(int aPredictionLimit) predictionLimit = aPredictionLimit; } + public void setWindowSize(int aWindowSize) + { + windowSize = aWindowSize; + } + + public int getWindowSize() + { + return windowSize; + } + @JsonIgnore public TrainingParameters getParameters() { diff --git a/inception/inception-imls-opennlp/src/test/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderTest.java b/inception/inception-imls-opennlp/src/test/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderTest.java index e1cdb3d79a5..edacd9e9543 100644 --- a/inception/inception-imls-opennlp/src/test/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderTest.java +++ b/inception/inception-imls-opennlp/src/test/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommenderTest.java @@ -18,6 +18,7 @@ package de.tudarmstadt.ukp.inception.recommendation.imls.opennlp.ner; import static de.tudarmstadt.ukp.clarin.webanno.api.casstorage.CasAccessMode.EXCLUSIVE_WRITE_ACCESS; +import static de.tudarmstadt.ukp.inception.support.uima.AnnotationBuilder.buildAnnotation; import static java.util.Arrays.asList; import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; import static org.assertj.core.api.Assertions.assertThat; @@ -43,6 +44,7 @@ import de.tudarmstadt.ukp.clarin.webanno.model.AnnotationFeature; import de.tudarmstadt.ukp.clarin.webanno.model.AnnotationLayer; import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.inception.annotation.storage.CasStorageSession; import de.tudarmstadt.ukp.inception.recommendation.api.evaluation.IncrementalSplitter; import de.tudarmstadt.ukp.inception.recommendation.api.evaluation.PercentageBasedSplitter; @@ -51,6 +53,7 @@ import de.tudarmstadt.ukp.inception.recommendation.api.recommender.RecommenderContext; import de.tudarmstadt.ukp.inception.support.test.recommendation.DkproTestHelper; import de.tudarmstadt.ukp.inception.support.test.recommendation.RecommenderTestHelper; +import de.tudarmstadt.ukp.inception.support.uima.SegmentationUtils; public class OpenNlpNerRecommenderTest { @@ -85,6 +88,50 @@ public void thatTrainingWorks() throws Exception .isPresent(); } + @Test + public void thatTrainingWorksCrossSentenceWithSimpleExample() throws Exception + { + var cas = JCasFactory.createJCas(); + cas.setDocumentText(""" + I like noodles. + I guess St. John is good. + Does St. John like noodles? + """); + + SegmentationUtils.segment(cas.getCas()); + + assertThat(cas.select(Token.class).asList()) // + .map(Token::getCoveredText) // + .containsAll(asList("St", ".", "John")) // + .doesNotContain("St.", "St. John"); + + buildAnnotation(cas, NamedEntity.class).onAll("St. John").buildAllAndAddToIndexes(); + var casList = asList(cas.getCas()); + + recommender.getLayer().setCrossSentence(true); + var sut = new OpenNlpNerRecommender(recommender, traits); + + sut.train(context, casList); + + assertThat(context.get(OpenNlpNerRecommender.KEY_MODEL)) // + .as("Model has been set") // + .isPresent(); + } + + @Test + public void thatTrainingWorksCrossSentence() throws Exception + { + recommender.getLayer().setCrossSentence(true); + var sut = new OpenNlpNerRecommender(recommender, traits); + + var casList = loadDevelopmentData(); + sut.train(context, casList); + + assertThat(context.get(OpenNlpNerRecommender.KEY_MODEL)) // + .as("Model has been set") // + .isPresent(); + } + @Test public void thatPredictionWorks() throws Exception { @@ -228,15 +275,12 @@ private List loadData(Dataset ds, File... files) throws UIMAException, IOEx private static Recommender buildRecommender() { - var layer = new AnnotationLayer(); - layer.setName(NamedEntity.class.getName()); + var layer = AnnotationLayer.builder().forJCasClass(NamedEntity.class).build(); - var feature = new AnnotationFeature(); - feature.setName("value"); + var feature = AnnotationFeature.builder().withLayer(layer) + .withName(NamedEntity._FeatName_value).build(); - var recommender = new Recommender(); - recommender.setLayer(layer); - recommender.setFeature(feature); + var recommender = Recommender.builder().withLayer(layer).withFeature(feature).build(); return recommender; } diff --git a/inception/inception-recommendation/src/test/java/de/tudarmstadt/ukp/inception/recommendation/span/SpanSuggestionExtractionTest.java b/inception/inception-recommendation/src/test/java/de/tudarmstadt/ukp/inception/recommendation/span/SpanSuggestionExtractionTest.java index cacd3d87f8c..733486b85eb 100644 --- a/inception/inception-recommendation/src/test/java/de/tudarmstadt/ukp/inception/recommendation/span/SpanSuggestionExtractionTest.java +++ b/inception/inception-recommendation/src/test/java/de/tudarmstadt/ukp/inception/recommendation/span/SpanSuggestionExtractionTest.java @@ -127,7 +127,7 @@ void testSpanExtraction() throws Exception var predictionCas = RecommenderTypeSystemUtils.makePredictionCas(originalCas, aFeatures); buildAnnotation(predictionCas, feature.getLayer().getName()) // - .on("\\bis\\b") // + .onMatch("\\bis\\b") // .withFeature(feature.getName(), "verb") // .withFeature(FEATURE_NAME_IS_PREDICTION, true) // .buildAndAddToIndexes(); diff --git a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/AnnotationBuilder.java b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/AnnotationBuilder.java index 910eef2d0b2..52811c3e28d 100644 --- a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/AnnotationBuilder.java +++ b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/AnnotationBuilder.java @@ -17,6 +17,10 @@ */ package de.tudarmstadt.ukp.inception.support.uima; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -24,11 +28,15 @@ import org.apache.uima.cas.Type; import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.fit.util.CasUtil; +import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; public class AnnotationBuilder extends FeatureStructureBuilder { + private record Range(int begin, int end) {} + + private final Set ranges = new LinkedHashSet<>(); public AnnotationBuilder(CAS aCas, Type aType) { @@ -37,19 +45,80 @@ public AnnotationBuilder(CAS aCas, Type aType) public AnnotationBuilder at(AnnotationFS aAnnotation) { - withFeature(CAS.FEATURE_BASE_NAME_BEGIN, aAnnotation.getBegin()); - withFeature(CAS.FEATURE_BASE_NAME_END, aAnnotation.getEnd()); + ranges.add(new Range(aAnnotation.getBegin(), aAnnotation.getEnd())); return this; } public AnnotationBuilder at(int aBegin, int aEnd) { - withFeature(CAS.FEATURE_BASE_NAME_BEGIN, aBegin); - withFeature(CAS.FEATURE_BASE_NAME_END, aEnd); + ranges.add(new Range(aBegin, aEnd)); return this; } - public AnnotationBuilder on(String aPattern) + public List buildAllAndAddToIndexes() + { + var annotations = buildAllWithoutAddingToIndexes(); + annotations.forEach(getCas()::addFsToIndexes); + return annotations; + } + + public List buildAllWithoutAddingToIndexes() + { + var annotations = new ArrayList(); + + for (var range : ranges) { + var ann = super.buildWithoutAddingToIndexes(); + ann.setBegin(range.begin); + ann.setEnd(range.end); + annotations.add(ann); + } + + return annotations; + } + + @Override + public T buildAndAddToIndexes() + { + if (ranges.size() > 1) { + throw new IllegalStateException( + "Use buildAllAndAddToIndexes() when multiple ranges have been specified."); + } + + ranges.forEach(range -> { + withFeature(CAS.FEATURE_BASE_NAME_BEGIN, range.begin); + withFeature(CAS.FEATURE_BASE_NAME_END, range.end); + }); + + return super.buildAndAddToIndexes(); + } + + @Override + public T buildWithoutAddingToIndexes() + { + if (ranges.size() > 1) { + throw new IllegalStateException( + "Use buildAllWithoutAddingToIndexes() when multiple ranges have been specified."); + } + + ranges.forEach(range -> { + withFeature(CAS.FEATURE_BASE_NAME_BEGIN, range.begin); + withFeature(CAS.FEATURE_BASE_NAME_END, range.end); + }); + + return super.buildWithoutAddingToIndexes(); + } + + public AnnotationBuilder on(String aText) + { + return onMatch(Pattern.quote(aText)); + } + + public AnnotationBuilder onAll(String aText) + { + return onAllMatches(Pattern.quote(aText)); + } + + public AnnotationBuilder onMatch(String aPattern) { Matcher m = Pattern.compile(aPattern).matcher(getCas().getDocumentText()); if (m.find()) { @@ -58,11 +127,26 @@ public AnnotationBuilder on(String aPattern) return this; } + public AnnotationBuilder onAllMatches(String aPattern) + { + Matcher m = Pattern.compile(aPattern).matcher(getCas().getDocumentText()); + while (m.find()) { + at(m.start(), m.end()); + } + return this; + } + public static AnnotationBuilder buildAnnotation(CAS aCas, Type aType) { return new AnnotationBuilder(aCas, aType); } + public static AnnotationBuilder buildAnnotation(JCas aCas, + Class aType) + { + return new AnnotationBuilder(aCas.getCas(), aCas.getCasType(aType)); + } + public static AnnotationBuilder buildAnnotation(CAS aCas, Class aType) { diff --git a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/SegmentationUtils.java b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/SegmentationUtils.java index 7ed17356b9c..faa811d7261 100644 --- a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/SegmentationUtils.java +++ b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/SegmentationUtils.java @@ -36,6 +36,12 @@ private SegmentationUtils() // No instances } + public static void segment(CAS aCas) + { + splitSentences(aCas, null); + tokenize(aCas); + } + public static void splitSentences(CAS aCas) { splitSentences(aCas, null); From d4582b0f39b3b967e1f07d717181e4f33965d399 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Sun, 31 Dec 2023 13:01:54 +0100 Subject: [PATCH 5/5] #4399 - Allow OpenNLP Multi-Token Sequence Classifier to work for cross-sentence layers - Better log messages --- .../imls/opennlp/ner/OpenNlpNerRecommender.java | 15 +++++++++------ .../tasks/RecommendationTask_ImplBase.java | 5 +++++ .../recommendation/tasks/TrainingTask.java | 13 ++++++++----- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java b/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java index 36c54f34be1..21f230f136a 100644 --- a/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java +++ b/inception/inception-imls-opennlp/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/opennlp/ner/OpenNlpNerRecommender.java @@ -98,7 +98,7 @@ public boolean isReadyForPrediction(RecommenderContext aContext) @Override public void train(RecommenderContext aContext, List aCasses) throws RecommendationException { - var nameSamples = extractSamples(aCasses); + var nameSamples = extractSamples(aContext, aCasses); if (nameSamples.size() < 2) { aContext.log(LogMessage.warn(getRecommender().getName(), @@ -186,7 +186,7 @@ public Range predict(PredictionContext aContext, CAS aCas, int aBegin, int aEnd) @Override public int estimateSampleCount(List aCasses) { - return extractSamples(aCasses).size(); + return extractSamples(null, aCasses).size(); } @Override @@ -318,14 +318,17 @@ private String determineLabel(Span aName, int aTokenIdx) return label; } - private List extractSamples(Iterable aCasses) + private List extractSamples(RecommenderContext aContext, Iterable aCasses) { if (getRecommender().getLayer().isCrossSentence()) { + if (aContext != null) { + aContext.log(LogMessage.info(getRecommender().getName(), + "Training using sliding-window since layer permits cross-sentence annotations.")); + } return extractSamplesUsingSlidingWindow(aCasses); } - else { - return extractSamplesFromSentences(aCasses); - } + + return extractSamplesFromSentences(aCasses); } private List extractSamplesFromSentences(Iterable aCasses) diff --git a/inception/inception-recommendation/src/main/java/de/tudarmstadt/ukp/inception/recommendation/tasks/RecommendationTask_ImplBase.java b/inception/inception-recommendation/src/main/java/de/tudarmstadt/ukp/inception/recommendation/tasks/RecommendationTask_ImplBase.java index e1e19a1fd1b..e4258deb5a6 100644 --- a/inception/inception-recommendation/src/main/java/de/tudarmstadt/ukp/inception/recommendation/tasks/RecommendationTask_ImplBase.java +++ b/inception/inception-recommendation/src/main/java/de/tudarmstadt/ukp/inception/recommendation/tasks/RecommendationTask_ImplBase.java @@ -71,4 +71,9 @@ public void error(String aFormat, Object... aValues) { logMessages.add(LogMessage.error(this, aFormat, aValues)); } + + public void log(LogMessage aMessage) + { + logMessages.add(aMessage); + } } diff --git a/inception/inception-recommendation/src/main/java/de/tudarmstadt/ukp/inception/recommendation/tasks/TrainingTask.java b/inception/inception-recommendation/src/main/java/de/tudarmstadt/ukp/inception/recommendation/tasks/TrainingTask.java index 76d8e3385cf..8c1413f0510 100644 --- a/inception/inception-recommendation/src/main/java/de/tudarmstadt/ukp/inception/recommendation/tasks/TrainingTask.java +++ b/inception/inception-recommendation/src/main/java/de/tudarmstadt/ukp/inception/recommendation/tasks/TrainingTask.java @@ -406,8 +406,9 @@ private void logTrainingSuccessful(User user, LazyInitializer