#5043 - Ability to specify token breaking zones when calling tokenizer

- Added new signature to the tokenizer call - Added test - Consolidated existing code
inception-project · Sep 7, 2024 · 41b137d · 41b137d
1 parent 79d0e49
commit 41b137d
Show file tree

Hide file tree

Showing 2 changed files with 101 additions and 59 deletions.
diff --git a/...ption-export/src/test/java/de/tudarmstadt/ukp/inception/export/SegmentationUtilsTest.java b/...ption-export/src/test/java/de/tudarmstadt/ukp/inception/export/SegmentationUtilsTest.java
@@ -17,28 +17,29 @@
  */
 package de.tudarmstadt.ukp.inception.export;
 
+import static de.tudarmstadt.ukp.inception.support.uima.SegmentationUtils.splitSentences;
+import static de.tudarmstadt.ukp.inception.support.uima.SegmentationUtils.tokenize;
+import static org.apache.uima.fit.factory.JCasFactory.createText;
 import static org.apache.uima.fit.util.CasUtil.toText;
 import static org.apache.uima.fit.util.JCasUtil.select;
 import static org.assertj.core.api.Assertions.assertThat;
 
-import org.apache.uima.fit.factory.JCasFactory;
 import org.junit.jupiter.api.Test;
 
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div;
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading;
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
 import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
-import de.tudarmstadt.ukp.inception.support.uima.SegmentationUtils;
 
 public class SegmentationUtilsTest
 {
     @Test
     public void testSplitSentences() throws Exception
     {
-        var jcas = JCasFactory.createText("I am one. I am two.", "en");
+        var jcas = createText("I am one. I am two.", "en");
 
-        SegmentationUtils.splitSentences(jcas.getCas());
+        splitSentences(jcas.getCas());
 
         assertThat(toText(select(jcas, Sentence.class))) //
                 .containsExactly("I am one.", "I am two.");
@@ -47,11 +48,11 @@ public void testSplitSentences() throws Exception
     @Test
     public void testSplitSentencesWithZones() throws Exception
     {
-        var jcas = JCasFactory.createText("Heading I am two.", "en");
+        var jcas = createText("Heading I am two.", "en");
         new Heading(jcas, 0, 7).addToIndexes();
         new Paragraph(jcas, 8, 17).addToIndexes();
 
-        SegmentationUtils.splitSentences(jcas.getCas(), jcas.select(Div.class));
+        splitSentences(jcas.getCas(), jcas.select(Div.class));
 
         assertThat(toText(select(jcas, Sentence.class))) //
                 .containsExactly("Heading", "I am two.");
@@ -60,16 +61,34 @@ public void testSplitSentencesWithZones() throws Exception
     @Test
     public void testTokenize() throws Exception
     {
-        var jcas = JCasFactory.createText("i am one.i am two.", "en");
+        var jcas = createText("i am one.i am two.", "en");
         new Sentence(jcas, 0, 9).addToIndexes();
         new Sentence(jcas, 9, 18).addToIndexes();
 
-        SegmentationUtils.tokenize(jcas.getCas());
+        tokenize(jcas.getCas());
 
         assertThat(toText(select(jcas, Sentence.class))) //
                 .containsExactly("i am one.", "i am two.");
 
         assertThat(toText(select(jcas, Token.class))) //
                 .containsExactly("i", "am", "one", ".", "i", "am", "two", ".");
     }
+
+    @Test
+    public void testTokenizeWitZones() throws Exception
+    {
+        var jcas = createText("i am one.i am two.", "en");
+        new Sentence(jcas, 0, 9).addToIndexes();
+        new Sentence(jcas, 9, 18).addToIndexes();
+        new Div(jcas, 3, 3).addToIndexes();
+        new Div(jcas, 12, 15).addToIndexes();
+
+        tokenize(jcas.getCas(), jcas.select(Div.class));
+
+        assertThat(toText(select(jcas, Sentence.class))) //
+                .containsExactly("i am one.", "i am two.");
+
+        assertThat(toText(select(jcas, Token.class))) //
+                .containsExactly("i", "a", "m", "one", ".", "i", "a", "m", "t", "wo", ".");
+    }
 }
diff --git a/...on-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/SegmentationUtils.java b/...on-support/src/main/java/de/tudarmstadt/ukp/inception/support/uima/SegmentationUtils.java
@@ -20,15 +20,15 @@
 import static de.tudarmstadt.ukp.inception.support.uima.WebAnnoCasUtil.createSentence;
 import static de.tudarmstadt.ukp.inception.support.uima.WebAnnoCasUtil.createToken;
 import static de.tudarmstadt.ukp.inception.support.uima.WebAnnoCasUtil.selectSentences;
-import static org.apache.uima.fit.util.CasUtil.getType;
+import static java.text.BreakIterator.DONE;
+import static java.util.Locale.US;
 
 import java.text.BreakIterator;
 import java.util.Locale;
 
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.text.AnnotationFS;
 
-import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
 import de.tudarmstadt.ukp.inception.support.text.TrimUtils;
 import it.unimi.dsi.fastutil.ints.IntArrayList;
 
@@ -42,7 +42,7 @@ private SegmentationUtils()
     public static void segment(CAS aCas)
     {
         splitSentences(aCas, null);
-        tokenize(aCas);
+        tokenize(aCas, null);
     }
 
     public static void splitSentences(CAS aCas)
@@ -56,12 +56,11 @@ public static void splitSentences(CAS aCas, int aBegin, int aEnd)
         bi.setText(aCas.getDocumentText().substring(aBegin, aEnd));
         var last = bi.first();
         var cur = bi.next();
-        while (cur != BreakIterator.DONE) {
-            var sentence = aCas.createAnnotation(getType(aCas, Sentence.class), last + aBegin,
-                    cur + aBegin);
-            sentence.trim();
-            if (sentence.getBegin() != sentence.getEnd()) {
-                aCas.addFsToIndexes(sentence);
+        while (cur != DONE) {
+            var span = new int[] { last + aBegin, cur + aBegin };
+            TrimUtils.trim(aCas.getDocumentText(), span);
+            if (!isEmpty(span[0], span[1])) {
+                aCas.addFsToIndexes(createSentence(aCas, span[0], span[1]));
             }
             last = cur;
             cur = bi.next();
@@ -74,67 +73,91 @@ public static void splitSentences(CAS aCas, Iterable<? extends AnnotationFS> aZo
             return;
         }
 
-        int[] sortedZoneBoundaries = null;
-
-        if (aZones != null) {
-            var zoneBoundaries = new IntArrayList();
-            for (var zone : aZones) {
-                zoneBoundaries.add(zone.getBegin());
-                zoneBoundaries.add(zone.getEnd());
-            }
-
-            sortedZoneBoundaries = zoneBoundaries.intStream().distinct().sorted().toArray();
-        }
-
-        if (sortedZoneBoundaries == null || sortedZoneBoundaries.length < 2) {
-            sortedZoneBoundaries = new int[] { 0, aCas.getDocumentText().length() };
-        }
+        int[] sortedZoneBoundaries = sortedZoneBoundaries(aCas, aZones);
 
         for (int i = 1; i < sortedZoneBoundaries.length; i++) {
             var begin = sortedZoneBoundaries[i - 1];
             var end = sortedZoneBoundaries[i];
-            var bi = BreakIterator.getSentenceInstance(Locale.US);
-            bi.setText(aCas.getDocumentText().substring(begin, end));
-            var last = bi.first();
-            var cur = bi.next();
-            while (cur != BreakIterator.DONE) {
-                var span = new int[] { last + begin, cur + begin };
-                TrimUtils.trim(aCas.getDocumentText(), span);
-                if (!isEmpty(span[0], span[1])) {
-                    aCas.addFsToIndexes(createSentence(aCas, span[0], span[1]));
-                }
-                last = cur;
-                cur = bi.next();
-            }
+
+            splitSentences(aCas, begin, end);
         }
     }
 
     public static void tokenize(CAS aCas)
+    {
+        tokenize(aCas, null);
+    }
+
+    /**
+     * Creates tokens inside every sentence of the CAS. In addition to the word boundaries found
+     * by the break iterator, every begin/end offset of the given zones that falls inside a
+     * sentence forces a token break at that offset. Does nothing if the CAS has no document text.
+     * <p>
+     * NOTE(review): the single forward-moving boundary cursor assumes that the sentences are
+     * returned in ascending, non-overlapping order - confirm against {@code selectSentences}.
+     *
+     * @param aCas
+     *            the CAS whose sentences are tokenized; tokens are added to its indexes.
+     * @param aZones
+     *            annotations whose begin and end offsets act as forced token boundaries, or
+     *            {@code null} to use only the regular word boundaries.
+     */
+    public static void tokenize(CAS aCas, Iterable<? extends AnnotationFS> aZones)
     {
         if (aCas.getDocumentText() == null) {
             return;
         }
 
-        BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
-        for (AnnotationFS s : selectSentences(aCas)) {
-            bi.setText(s.getCoveredText());
-            int last = bi.first();
-            int cur = bi.next();
-            while (cur != BreakIterator.DONE) {
-                int[] span = new int[] { last, cur };
-                TrimUtils.trim(s.getCoveredText(), span);
-                if (!isEmpty(span[0], span[1])) {
-                    aCas.addFsToIndexes(
-                            createToken(aCas, span[0] + s.getBegin(), span[1] + s.getBegin()));
-                }
-                last = cur;
-                cur = bi.next();
+        var sortedZoneBoundaries = sortedZoneBoundaries(aCas, aZones);
+        var zbi = 0;
+
+        for (var s : selectSentences(aCas)) {
+            // Skip boundaries located before this sentence (e.g. in the gap between two
+            // sentences or before the first one). Without this, the cursor would stall on such
+            // a boundary and all later zone boundaries would be silently ignored.
+            while (zbi < sortedZoneBoundaries.length
+                    && sortedZoneBoundaries[zbi] < s.getBegin()) {
+                zbi++;
+            }
+
+            // The sentence edges are always boundaries, so the sentence is tokenized even if no
+            // zone boundary falls inside of it.
+            var innerZoneBoundariesBuffer = new IntArrayList();
+            innerZoneBoundariesBuffer.add(s.getBegin());
+            innerZoneBoundariesBuffer.add(s.getEnd());
+
+            // Collect the boundaries inside this sentence. A boundary equal to the sentence end
+            // is left for the next sentence, which may begin at exactly that offset.
+            while (zbi < sortedZoneBoundaries.length
+                    && sortedZoneBoundaries[zbi] < s.getEnd()) {
+                innerZoneBoundariesBuffer.add(sortedZoneBoundaries[zbi]);
+                zbi++;
+            }
+
+            // A zone boundary may coincide with the sentence begin, hence the de-duplication.
+            var innerZoneBoundaries = innerZoneBoundariesBuffer.intStream().distinct().sorted()
+                    .toArray();
+
+            // Tokenize each stretch between two consecutive boundaries separately so that no
+            // token can span a boundary.
+            for (int i = 1; i < innerZoneBoundaries.length; i++) {
+                var begin = innerZoneBoundaries[i - 1];
+                var end = innerZoneBoundaries[i];
+                tokenize(aCas, begin, end);
+            }
+        }
+    }
+
+    private static void tokenize(CAS aCas, int aBegin, int aEnd)
+    {
+        var bi = BreakIterator.getWordInstance(US);
+        bi.setText(aCas.getDocumentText().substring(aBegin, aEnd));
+        var last = bi.first();
+        var cur = bi.next();
+        while (cur != DONE) {
+            var span = new int[] { last + aBegin, cur + aBegin };
+            TrimUtils.trim(aCas.getDocumentText(), span);
+            if (!isEmpty(span[0], span[1])) {
+                aCas.addFsToIndexes(createToken(aCas, span[0], span[1]));
             }
+            last = cur;
+            cur = bi.next();
         }
     }
 
     public static boolean isEmpty(int aBegin, int aEnd)
     {
         return aBegin >= aEnd;
     }
+
+    private static int[] sortedZoneBoundaries(CAS aCas, Iterable<? extends AnnotationFS> aZones)
+    {
+        int[] sortedZoneBoundaries = null;
+
+        if (aZones != null) {
+            var zoneBoundaries = new IntArrayList();
+            for (var zone : aZones) {
+                zoneBoundaries.add(zone.getBegin());
+                zoneBoundaries.add(zone.getEnd());
+            }
+
+            sortedZoneBoundaries = zoneBoundaries.intStream().distinct().sorted().toArray();
+        }
+
+        if (sortedZoneBoundaries == null || sortedZoneBoundaries.length < 2) {
+            sortedZoneBoundaries = new int[] { 0, aCas.getDocumentText().length() };
+        }
+
+        return sortedZoneBoundaries;
+    }
 }