#4381 - Allow users to browse their past activity

- Extract segmentation utils into a separate reusable class - Add tests for extraction of span suggestions and document metadata suggestions
inception-project · Dec 17, 2023 · 5ec0a1a · 5ec0a1a
1 parent 11c5211
commit 5ec0a1a
Show file tree

Hide file tree

Showing 6 changed files with 354 additions and 145 deletions.
diff --git a/...rt/src/main/java/de/tudarmstadt/ukp/inception/export/DocumentImportExportServiceImpl.java b/...rt/src/main/java/de/tudarmstadt/ukp/inception/export/DocumentImportExportServiceImpl.java
@@ -17,11 +17,8 @@
  */
 package de.tudarmstadt.ukp.inception.export;
 
-import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.createSentence;
-import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.createToken;
 import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.exists;
 import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getRealCas;
-import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.selectSentences;
 import static de.tudarmstadt.ukp.clarin.webanno.api.casstorage.CasAccessMode.EXCLUSIVE_WRITE_ACCESS;
 import static de.tudarmstadt.ukp.clarin.webanno.api.casstorage.CasAccessMode.UNMANAGED_ACCESS;
 import static de.tudarmstadt.ukp.inception.project.api.ProjectService.DOCUMENT_FOLDER;
@@ -46,13 +43,11 @@
 import java.lang.invoke.MethodHandles;
 import java.net.MalformedURLException;
 import java.nio.file.Files;
-import java.text.BreakIterator;
 import java.util.ArrayList;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
-import java.util.Locale;
 import java.util.Map;
 
 import org.apache.commons.lang3.ClassUtils;
@@ -63,7 +58,6 @@
 import org.apache.uima.cas.Feature;
 import org.apache.uima.cas.FeatureStructure;
 import org.apache.uima.cas.Type;
-import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.resource.metadata.TypeSystemDescription;
 import org.slf4j.Logger;
@@ -101,7 +95,6 @@
 import de.tudarmstadt.ukp.inception.schema.api.AnnotationSchemaService;
 import de.tudarmstadt.ukp.inception.support.logging.BaseLoggers;
 import de.tudarmstadt.ukp.inception.support.logging.LogMessage;
-import it.unimi.dsi.fastutil.ints.IntArrayList;
 
 /**
  * <p>
@@ -385,7 +378,7 @@ private void splitTokensIfNecssaryAndCheckQuota(CAS cas, FormatSupport aFormat)
         Type tokenType = getType(cas, Token.class);
 
         if (!exists(cas, tokenType)) {
-            tokenize(cas);
+            SegmentationUtils.tokenize(cas);
         }
 
         if (properties.getMaxTokens() > 0) {
@@ -410,7 +403,7 @@ private void splitSenencesIfNecssaryAndCheckQuota(CAS cas, FormatSupport aFormat
         Type sentenceType = getType(cas, Sentence.class);
 
         if (!exists(cas, sentenceType)) {
-            splitSentences(cas);
+            SegmentationUtils.splitSentences(cas);
         }
 
         if (properties.getMaxSentences() > 0) {
@@ -429,133 +422,6 @@ private void splitSenencesIfNecssaryAndCheckQuota(CAS cas, FormatSupport aFormat
         }
     }
 
-    public static void splitSentences(CAS aCas)
-    {
-        splitSentences(aCas, null);
-    }
-
-    public static void splitSentences(CAS aCas, Iterable<? extends AnnotationFS> aZones)
-    {
-        if (aCas.getDocumentText() == null) {
-            return;
-        }
-
-        int[] sortedZoneBoundaries = null;
-
-        if (aZones != null) {
-            var zoneBoundaries = new IntArrayList();
-            for (var zone : aZones) {
-                zoneBoundaries.add(zone.getBegin());
-                zoneBoundaries.add(zone.getEnd());
-            }
-
-            sortedZoneBoundaries = zoneBoundaries.intStream().distinct().sorted().toArray();
-        }
-
-        if (sortedZoneBoundaries == null || sortedZoneBoundaries.length < 2) {
-            sortedZoneBoundaries = new int[] { 0, aCas.getDocumentText().length() };
-        }
-
-        for (int i = 1; i < sortedZoneBoundaries.length; i++) {
-            var begin = sortedZoneBoundaries[i - 1];
-            var end = sortedZoneBoundaries[i];
-            BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
-            bi.setText(aCas.getDocumentText().substring(begin, end));
-            int last = bi.first();
-            int cur = bi.next();
-            while (cur != BreakIterator.DONE) {
-                int[] span = new int[] { last + begin, cur + begin };
-                trim(aCas.getDocumentText(), span);
-                if (!isEmpty(span[0], span[1])) {
-                    aCas.addFsToIndexes(createSentence(aCas, span[0], span[1]));
-                }
-                last = cur;
-                cur = bi.next();
-            }
-        }
-    }
-
-    public static void tokenize(CAS aCas)
-    {
-        if (aCas.getDocumentText() == null) {
-            return;
-        }
-
-        BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
-        for (AnnotationFS s : selectSentences(aCas)) {
-            bi.setText(s.getCoveredText());
-            int last = bi.first();
-            int cur = bi.next();
-            while (cur != BreakIterator.DONE) {
-                int[] span = new int[] { last, cur };
-                trim(s.getCoveredText(), span);
-                if (!isEmpty(span[0], span[1])) {
-                    aCas.addFsToIndexes(
-                            createToken(aCas, span[0] + s.getBegin(), span[1] + s.getBegin()));
-                }
-                last = cur;
-                cur = bi.next();
-            }
-        }
-    }
-
-    /**
-     * Remove trailing or leading whitespace from the annotation.
-     * 
-     * @param aText
-     *            the text.
-     * @param aSpan
-     *            the offsets.
-     */
-    public static void trim(String aText, int[] aSpan)
-    {
-        String data = aText;
-
-        int begin = aSpan[0];
-        int end = aSpan[1] - 1;
-
-        // Remove whitespace at end
-        while ((end > 0) && trimChar(data.charAt(end))) {
-            end--;
-        }
-        end++;
-
-        // Remove whitespace at start
-        while ((begin < end) && trimChar(data.charAt(begin))) {
-            begin++;
-        }
-
-        aSpan[0] = begin;
-        aSpan[1] = end;
-    }
-
-    public static boolean isEmpty(int aBegin, int aEnd)
-    {
-        return aBegin >= aEnd;
-    }
-
-    public static boolean trimChar(final char aChar)
-    {
-        switch (aChar) {
-        case '\n':
-            return true; // Line break
-        case '\r':
-            return true; // Carriage return
-        case '\t':
-            return true; // Tab
-        case '\u200E':
-            return true; // LEFT-TO-RIGHT MARK
-        case '\u200F':
-            return true; // RIGHT-TO-LEFT MARK
-        case '\u2028':
-            return true; // LINE SEPARATOR
-        case '\u2029':
-            return true; // PARAGRAPH SEPARATOR
-        default:
-            return Character.isWhitespace(aChar);
-        }
-    }
-
     @Override
     public File exportCasToFile(CAS aCas, SourceDocument aDocument, String aFileName,
             FormatSupport aFormat)

diff --git a/...inception-export/src/main/java/de/tudarmstadt/ukp/inception/export/SegmentationUtils.java b/...inception-export/src/main/java/de/tudarmstadt/ukp/inception/export/SegmentationUtils.java
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Technische Universität Darmstadt under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The Technische Universität Darmstadt 
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ *  
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.tudarmstadt.ukp.inception.export;
+
+import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.createSentence;
+import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.createToken;
+import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.selectSentences;
+
+import java.text.BreakIterator;
+import java.util.Locale;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.text.AnnotationFS;
+
+import it.unimi.dsi.fastutil.ints.IntArrayList;
+
+public abstract class SegmentationUtils
+{
+    private SegmentationUtils()
+    {
+        // No instances - every member of this class is static
+    }
+
+    public static void splitSentences(CAS aCas)
+    {
+        splitSentences(aCas, null); // null zones: the whole document text is one zone
+    }
+
+    public static void splitSentences(CAS aCas, Iterable<? extends AnnotationFS> aZones)
+    {
+        if (aCas.getDocumentText() == null) {
+            return; // no text, nothing to segment
+        }
+
+        int[] sortedZoneBoundaries = null; // stays null when no zones are given
+
+        if (aZones != null) {
+            var zoneBoundaries = new IntArrayList();
+            for (var zone : aZones) {
+                zoneBoundaries.add(zone.getBegin());
+                zoneBoundaries.add(zone.getEnd());
+            }
+
+            sortedZoneBoundaries = zoneBoundaries.intStream().distinct().sorted().toArray();
+        }
+
+        if (sortedZoneBoundaries == null || sortedZoneBoundaries.length < 2) { // no usable zones
+            sortedZoneBoundaries = new int[] { 0, aCas.getDocumentText().length() };
+        }
+
+        for (int i = 1; i < sortedZoneBoundaries.length; i++) { // adjacent boundary pairs
+            var begin = sortedZoneBoundaries[i - 1]; // gaps between zones are scanned too
+            var end = sortedZoneBoundaries[i];
+            BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
+            bi.setText(aCas.getDocumentText().substring(begin, end));
+            int last = bi.first();
+            int cur = bi.next();
+            while (cur != BreakIterator.DONE) {
+                int[] span = new int[] { last + begin, cur + begin }; // document offsets
+                trim(aCas.getDocumentText(), span);
+                if (!isEmpty(span[0], span[1])) {
+                    aCas.addFsToIndexes(createSentence(aCas, span[0], span[1]));
+                }
+                last = cur;
+                cur = bi.next();
+            }
+        }
+    }
+
+    public static void tokenize(CAS aCas)
+    {
+        if (aCas.getDocumentText() == null) {
+            return; // no text, nothing to tokenize
+        }
+
+        BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
+        for (AnnotationFS s : selectSentences(aCas)) { // tokens are created per sentence
+            bi.setText(s.getCoveredText());
+            int last = bi.first();
+            int cur = bi.next();
+            while (cur != BreakIterator.DONE) {
+                int[] span = new int[] { last, cur }; // sentence-relative offsets
+                trim(s.getCoveredText(), span);
+                if (!isEmpty(span[0], span[1])) {
+                    aCas.addFsToIndexes(
+                            createToken(aCas, span[0] + s.getBegin(), span[1] + s.getBegin()));
+                }
+                last = cur;
+                cur = bi.next();
+            }
+        }
+    }
+
+    /**
+     * Strips {@link #trimChar(char) trim characters} from both ends of a span, in place.
+     * 
+     * @param aText
+     *            the text which the offsets in {@code aSpan} index into.
+     * @param aSpan
+     *            {@code [begin, end)} offsets; {@code begin >= end} afterwards means nothing is left.
+     */
+    public static void trim(String aText, int[] aSpan)
+    {
+        String data = aText;
+
+        int begin = aSpan[0];
+        int end = aSpan[1] - 1; // index of the last character inside the span
+
+        // Move end left past trailing trim characters; note the lower bound is 0, not begin
+        while ((end > 0) && trimChar(data.charAt(end))) {
+            end--;
+        }
+        end++; // back to an exclusive end offset
+
+        // Move begin right past leading trim characters, but never beyond end
+        while ((begin < end) && trimChar(data.charAt(begin))) {
+            begin++;
+        }
+
+        aSpan[0] = begin;
+        aSpan[1] = end;
+    }
+
+    public static boolean isEmpty(int aBegin, int aEnd)
+    {
+        return aBegin >= aEnd; // also true for inverted spans (begin after end)
+    }
+
+    public static boolean trimChar(final char aChar)
+    {
+        switch (aChar) {
+        case '\n':
+            return true; // Line break
+        case '\r':
+            return true; // Carriage return
+        case '\t':
+            return true; // Tab
+        case '\u200E':
+            return true; // LEFT-TO-RIGHT MARK
+        case '\u200F':
+            return true; // RIGHT-TO-LEFT MARK
+        case '\u2028':
+            return true; // LINE SEPARATOR
+        case '\u2029':
+            return true; // PARAGRAPH SEPARATOR
+        default:
+            return Character.isWhitespace(aChar); // anything else: Java's whitespace definition
+        }
+    }
+
+}
diff --git a/...kp/inception/export/SegmentationTest.java → ...ception/export/SegmentationUtilsTest.java b/...kp/inception/export/SegmentationTest.java → ...ception/export/SegmentationUtilsTest.java
@@ -31,14 +31,14 @@
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
 
-public class SegmentationTest
+public class SegmentationUtilsTest
 {
     @Test
     public void testSplitSentences() throws Exception
     {
         JCas jcas = JCasFactory.createText("I am one. I am two.", "en");
 
-        DocumentImportExportServiceImpl.splitSentences(jcas.getCas());
+        SegmentationUtils.splitSentences(jcas.getCas());
 
         assertThat(toText(select(jcas, Sentence.class))) //
                 .containsExactly("I am one.", "I am two.");
@@ -51,7 +51,7 @@ public void testSplitSentencesWithZones() throws Exception
         new Heading(jcas, 0, 7).addToIndexes();
         new Paragraph(jcas, 8, 17).addToIndexes();
 
-        DocumentImportExportServiceImpl.splitSentences(jcas.getCas(), jcas.select(Div.class));
+        SegmentationUtils.splitSentences(jcas.getCas(), jcas.select(Div.class));
 
         assertThat(toText(select(jcas, Sentence.class))) //
                 .containsExactly("Heading", "I am two.");
@@ -64,7 +64,7 @@ public void testTokenize() throws Exception
         new Sentence(jcas, 0, 9).addToIndexes();
         new Sentence(jcas, 9, 18).addToIndexes();
 
-        DocumentImportExportServiceImpl.tokenize(jcas.getCas());
+        SegmentationUtils.tokenize(jcas.getCas());
 
         assertThat(toText(select(jcas, Sentence.class))) //
                 .containsExactly("i am one.", "i am two.");