Skip to content

Commit

Permalink
#5043 - Ability to specify token breaking zones when calling tokenizer
Browse files Browse the repository at this point in the history
- Added new signature to the tokenizer call
- Added test
- Consolidated existing code
  • Loading branch information
reckart committed Sep 7, 2024
1 parent 79d0e49 commit 41b137d
Show file tree
Hide file tree
Showing 2 changed files with 101 additions and 59 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,28 +17,29 @@
*/
package de.tudarmstadt.ukp.inception.export;

import static de.tudarmstadt.ukp.inception.support.uima.SegmentationUtils.splitSentences;
import static de.tudarmstadt.ukp.inception.support.uima.SegmentationUtils.tokenize;
import static org.apache.uima.fit.factory.JCasFactory.createText;
import static org.apache.uima.fit.util.CasUtil.toText;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.assertj.core.api.Assertions.assertThat;

import org.apache.uima.fit.factory.JCasFactory;
import org.junit.jupiter.api.Test;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Div;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Heading;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.inception.support.uima.SegmentationUtils;

public class SegmentationUtilsTest
{
@Test
public void testSplitSentences() throws Exception
{
var jcas = JCasFactory.createText("I am one. I am two.", "en");
var jcas = createText("I am one. I am two.", "en");

SegmentationUtils.splitSentences(jcas.getCas());
splitSentences(jcas.getCas());

assertThat(toText(select(jcas, Sentence.class))) //
.containsExactly("I am one.", "I am two.");
Expand All @@ -47,11 +48,11 @@ public void testSplitSentences() throws Exception
@Test
public void testSplitSentencesWithZones() throws Exception
{
var jcas = JCasFactory.createText("Heading I am two.", "en");
var jcas = createText("Heading I am two.", "en");
new Heading(jcas, 0, 7).addToIndexes();
new Paragraph(jcas, 8, 17).addToIndexes();

SegmentationUtils.splitSentences(jcas.getCas(), jcas.select(Div.class));
splitSentences(jcas.getCas(), jcas.select(Div.class));

assertThat(toText(select(jcas, Sentence.class))) //
.containsExactly("Heading", "I am two.");
Expand All @@ -60,16 +61,34 @@ public void testSplitSentencesWithZones() throws Exception
/**
 * Checks that {@code tokenize} without zones creates tokens inside the pre-existing sentences
 * and leaves the sentence annotations themselves unchanged.
 */
@Test
public void testTokenize() throws Exception
{
    // The two sentences are directly adjacent: there is no whitespace at offset 9.
    var jcas = createText("i am one.i am two.", "en");
    new Sentence(jcas, 0, 9).addToIndexes();
    new Sentence(jcas, 9, 18).addToIndexes();

    tokenize(jcas.getCas());

    assertThat(toText(select(jcas, Sentence.class))) //
            .containsExactly("i am one.", "i am two.");

    assertThat(toText(select(jcas, Token.class))) //
            .containsExactly("i", "am", "one", ".", "i", "am", "two", ".");
}

/**
 * Checks that the begin/end offsets of the zones handed to {@code tokenize} act as additional
 * token boundaries inside the sentences.
 */
@Test
public void testTokenizeWitZones() throws Exception
{
    var jcas = createText("i am one.i am two.", "en");
    var cas = jcas.getCas();

    new Sentence(jcas, 0, 9).addToIndexes();
    new Sentence(jcas, 9, 18).addToIndexes();
    // Zero-width zone at offset 3: falls between "a" and "m" of the first "am".
    new Div(jcas, 3, 3).addToIndexes();
    // Zone [12, 15): offset 12 falls inside the second "am", offset 15 inside "two".
    new Div(jcas, 12, 15).addToIndexes();

    tokenize(cas, jcas.select(Div.class));

    var sentences = toText(select(jcas, Sentence.class));
    assertThat(sentences).containsExactly("i am one.", "i am two.");

    var tokens = toText(select(jcas, Token.class));
    assertThat(tokens) //
            .containsExactly("i", "a", "m", "one", ".", "i", "a", "m", "t", "wo", ".");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,15 @@
import static de.tudarmstadt.ukp.inception.support.uima.WebAnnoCasUtil.createSentence;
import static de.tudarmstadt.ukp.inception.support.uima.WebAnnoCasUtil.createToken;
import static de.tudarmstadt.ukp.inception.support.uima.WebAnnoCasUtil.selectSentences;
import static org.apache.uima.fit.util.CasUtil.getType;
import static java.text.BreakIterator.DONE;
import static java.util.Locale.US;

import java.text.BreakIterator;
import java.util.Locale;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.text.AnnotationFS;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.inception.support.text.TrimUtils;
import it.unimi.dsi.fastutil.ints.IntArrayList;

Expand All @@ -42,7 +42,7 @@ private SegmentationUtils()
/**
 * Runs sentence splitting followed by tokenization on the given CAS. No zones are used:
 * {@code null} is passed as the zones argument to both steps.
 *
 * @param aCas
 *            the CAS to segment.
 */
public static void segment(CAS aCas)
{
    splitSentences(aCas, null);
    // Tokenize exactly once; calling both tokenize overloads would index every token twice.
    tokenize(aCas, null);
}

public static void splitSentences(CAS aCas)
Expand All @@ -56,12 +56,11 @@ public static void splitSentences(CAS aCas, int aBegin, int aEnd)
bi.setText(aCas.getDocumentText().substring(aBegin, aEnd));
var last = bi.first();
var cur = bi.next();
while (cur != BreakIterator.DONE) {
var sentence = aCas.createAnnotation(getType(aCas, Sentence.class), last + aBegin,
cur + aBegin);
sentence.trim();
if (sentence.getBegin() != sentence.getEnd()) {
aCas.addFsToIndexes(sentence);
while (cur != DONE) {
var span = new int[] { last + aBegin, cur + aBegin };
TrimUtils.trim(aCas.getDocumentText(), span);
if (!isEmpty(span[0], span[1])) {
aCas.addFsToIndexes(createSentence(aCas, span[0], span[1]));
}
last = cur;
cur = bi.next();
Expand All @@ -74,67 +73,91 @@ public static void splitSentences(CAS aCas, Iterable<? extends AnnotationFS> aZo
return;
}

int[] sortedZoneBoundaries = null;

if (aZones != null) {
var zoneBoundaries = new IntArrayList();
for (var zone : aZones) {
zoneBoundaries.add(zone.getBegin());
zoneBoundaries.add(zone.getEnd());
}

sortedZoneBoundaries = zoneBoundaries.intStream().distinct().sorted().toArray();
}

if (sortedZoneBoundaries == null || sortedZoneBoundaries.length < 2) {
sortedZoneBoundaries = new int[] { 0, aCas.getDocumentText().length() };
}
int[] sortedZoneBoundaries = sortedZoneBoundaries(aCas, aZones);

for (int i = 1; i < sortedZoneBoundaries.length; i++) {
var begin = sortedZoneBoundaries[i - 1];
var end = sortedZoneBoundaries[i];
var bi = BreakIterator.getSentenceInstance(Locale.US);
bi.setText(aCas.getDocumentText().substring(begin, end));
var last = bi.first();
var cur = bi.next();
while (cur != BreakIterator.DONE) {
var span = new int[] { last + begin, cur + begin };
TrimUtils.trim(aCas.getDocumentText(), span);
if (!isEmpty(span[0], span[1])) {
aCas.addFsToIndexes(createSentence(aCas, span[0], span[1]));
}
last = cur;
cur = bi.next();
}

splitSentences(aCas, begin, end);
}
}

/**
 * Tokenizes the given CAS without any zones. Delegates to the two-argument {@code tokenize}
 * overload, passing {@code null} as the zones.
 *
 * @param aCas
 *            the CAS to tokenize.
 */
public static void tokenize(CAS aCas)
{
tokenize(aCas, null);
}

public static void tokenize(CAS aCas, Iterable<? extends AnnotationFS> aZones)
{
if (aCas.getDocumentText() == null) {
return;
}

BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
for (AnnotationFS s : selectSentences(aCas)) {
bi.setText(s.getCoveredText());
int last = bi.first();
int cur = bi.next();
while (cur != BreakIterator.DONE) {
int[] span = new int[] { last, cur };
TrimUtils.trim(s.getCoveredText(), span);
if (!isEmpty(span[0], span[1])) {
aCas.addFsToIndexes(
createToken(aCas, span[0] + s.getBegin(), span[1] + s.getBegin()));
}
last = cur;
cur = bi.next();
var sortedZoneBoundaries = sortedZoneBoundaries(aCas, aZones);
var zbi = 0;

for (var s : selectSentences(aCas)) {
var innerZoneBoundariesBuffer = new IntArrayList();
innerZoneBoundariesBuffer.add(s.getBegin());
innerZoneBoundariesBuffer.add(s.getEnd());
while (zbi < sortedZoneBoundaries.length && sortedZoneBoundaries[zbi] >= s.getBegin()
&& sortedZoneBoundaries[zbi] < s.getEnd()) {
innerZoneBoundariesBuffer.add(sortedZoneBoundaries[zbi]);
zbi++;
}

var innerZoneBoundaries = innerZoneBoundariesBuffer.intStream().distinct().sorted()
.toArray();

for (int i = 1; i < innerZoneBoundaries.length; i++) {
var begin = innerZoneBoundaries[i - 1];
var end = innerZoneBoundaries[i];
tokenize(aCas, begin, end);
}
}
}

/**
 * Tokenizes the part of the document text between the two given offsets using a US-locale
 * word {@link BreakIterator} and adds a token to the CAS indexes for every non-empty piece.
 *
 * @param aCas
 *            the CAS providing the document text and receiving the tokens.
 * @param aBegin
 *            begin offset of the range within the document text.
 * @param aEnd
 *            end offset of the range within the document text.
 */
private static void tokenize(CAS aCas, int aBegin, int aEnd)
{
    var wordBreaks = BreakIterator.getWordInstance(US);
    wordBreaks.setText(aCas.getDocumentText().substring(aBegin, aEnd));

    var start = wordBreaks.first();
    for (var end = wordBreaks.next(); end != DONE; end = wordBreaks.next()) {
        // Break positions are relative to the substring; shift them to document offsets.
        var span = new int[] { start + aBegin, end + aBegin };
        // The span is read back after this call, so trim is expected to adjust it in
        // place (presumably stripping surrounding whitespace - confirm in TrimUtils).
        TrimUtils.trim(aCas.getDocumentText(), span);
        if (!isEmpty(span[0], span[1])) {
            aCas.addFsToIndexes(createToken(aCas, span[0], span[1]));
        }
        start = end;
    }
}

/**
 * Tells whether the span delimited by the two offsets covers no text.
 *
 * @param aBegin
 *            begin offset of the span.
 * @param aEnd
 *            end offset of the span.
 * @return {@code true} if the begin offset is at or after the end offset.
 */
public static boolean isEmpty(int aBegin, int aEnd)
{
    return aEnd <= aBegin;
}

/**
 * Collects the begin and end offsets of the given zones as a sorted array without duplicates.
 * If no zones are given, or the zones yield fewer than two distinct offsets, the result is
 * the pair {@code { 0, <length of document text> }} instead.
 *
 * @param aCas
 *            the CAS whose document text length is used for the fallback.
 * @param aZones
 *            the zones to take the offsets from; may be {@code null}.
 * @return the sorted, distinct boundary offsets; always at least two entries.
 */
private static int[] sortedZoneBoundaries(CAS aCas, Iterable<? extends AnnotationFS> aZones)
{
    if (aZones != null) {
        var offsets = new IntArrayList();
        for (var zone : aZones) {
            offsets.add(zone.getBegin());
            offsets.add(zone.getEnd());
        }

        var boundaries = offsets.intStream().distinct().sorted().toArray();
        if (boundaries.length >= 2) {
            return boundaries;
        }
    }

    // Fallback: the whole document acts as a single zone.
    return new int[] { 0, aCas.getDocumentText().length() };
}
}

0 comments on commit 41b137d

Please sign in to comment.