#4381 - Allow users to browse their past activity
- Extract segmentation utils into separate reusable class
- Added tests for extraction of span suggestion and document metadata suggestions
reckart committed Dec 17, 2023
1 parent 11c5211 commit 5ec0a1a
Showing 6 changed files with 354 additions and 145 deletions.
DocumentImportExportServiceImpl.java
@@ -17,11 +17,8 @@
*/
package de.tudarmstadt.ukp.inception.export;

import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.createSentence;
import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.createToken;
import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.exists;
import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.getRealCas;
import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.selectSentences;
import static de.tudarmstadt.ukp.clarin.webanno.api.casstorage.CasAccessMode.EXCLUSIVE_WRITE_ACCESS;
import static de.tudarmstadt.ukp.clarin.webanno.api.casstorage.CasAccessMode.UNMANAGED_ACCESS;
import static de.tudarmstadt.ukp.inception.project.api.ProjectService.DOCUMENT_FOLDER;
@@ -46,13 +43,11 @@
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.nio.file.Files;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.commons.lang3.ClassUtils;
@@ -63,7 +58,6 @@
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.slf4j.Logger;
@@ -101,7 +95,6 @@
import de.tudarmstadt.ukp.inception.schema.api.AnnotationSchemaService;
import de.tudarmstadt.ukp.inception.support.logging.BaseLoggers;
import de.tudarmstadt.ukp.inception.support.logging.LogMessage;
import it.unimi.dsi.fastutil.ints.IntArrayList;

/**
* <p>
@@ -385,7 +378,7 @@ private void splitTokensIfNecssaryAndCheckQuota(CAS cas, FormatSupport aFormat)
Type tokenType = getType(cas, Token.class);

if (!exists(cas, tokenType)) {
tokenize(cas);
SegmentationUtils.tokenize(cas);
}

if (properties.getMaxTokens() > 0) {
@@ -410,7 +403,7 @@ private void splitSenencesIfNecssaryAndCheckQuota(CAS cas, FormatSupport aFormat
Type sentenceType = getType(cas, Sentence.class);

if (!exists(cas, sentenceType)) {
splitSentences(cas);
SegmentationUtils.splitSentences(cas);
}

if (properties.getMaxSentences() > 0) {
@@ -429,133 +422,6 @@ private void splitSenencesIfNecssaryAndCheckQuota(CAS cas, FormatSupport aFormat
}
}

public static void splitSentences(CAS aCas)
{
splitSentences(aCas, null);
}

public static void splitSentences(CAS aCas, Iterable<? extends AnnotationFS> aZones)
{
if (aCas.getDocumentText() == null) {
return;
}

int[] sortedZoneBoundaries = null;

if (aZones != null) {
var zoneBoundaries = new IntArrayList();
for (var zone : aZones) {
zoneBoundaries.add(zone.getBegin());
zoneBoundaries.add(zone.getEnd());
}

sortedZoneBoundaries = zoneBoundaries.intStream().distinct().sorted().toArray();
}

if (sortedZoneBoundaries == null || sortedZoneBoundaries.length < 2) {
sortedZoneBoundaries = new int[] { 0, aCas.getDocumentText().length() };
}

for (int i = 1; i < sortedZoneBoundaries.length; i++) {
var begin = sortedZoneBoundaries[i - 1];
var end = sortedZoneBoundaries[i];
BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
bi.setText(aCas.getDocumentText().substring(begin, end));
int last = bi.first();
int cur = bi.next();
while (cur != BreakIterator.DONE) {
int[] span = new int[] { last + begin, cur + begin };
trim(aCas.getDocumentText(), span);
if (!isEmpty(span[0], span[1])) {
aCas.addFsToIndexes(createSentence(aCas, span[0], span[1]));
}
last = cur;
cur = bi.next();
}
}
}

public static void tokenize(CAS aCas)
{
if (aCas.getDocumentText() == null) {
return;
}

BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
for (AnnotationFS s : selectSentences(aCas)) {
bi.setText(s.getCoveredText());
int last = bi.first();
int cur = bi.next();
while (cur != BreakIterator.DONE) {
int[] span = new int[] { last, cur };
trim(s.getCoveredText(), span);
if (!isEmpty(span[0], span[1])) {
aCas.addFsToIndexes(
createToken(aCas, span[0] + s.getBegin(), span[1] + s.getBegin()));
}
last = cur;
cur = bi.next();
}
}
}

/**
* Remove trailing or leading whitespace from the annotation.
*
* @param aText
* the text.
* @param aSpan
* the offsets.
*/
public static void trim(String aText, int[] aSpan)
{
String data = aText;

int begin = aSpan[0];
int end = aSpan[1] - 1;

// Remove whitespace at end
while ((end > 0) && trimChar(data.charAt(end))) {
end--;
}
end++;

// Remove whitespace at start
while ((begin < end) && trimChar(data.charAt(begin))) {
begin++;
}

aSpan[0] = begin;
aSpan[1] = end;
}

public static boolean isEmpty(int aBegin, int aEnd)
{
return aBegin >= aEnd;
}

public static boolean trimChar(final char aChar)
{
switch (aChar) {
case '\n':
return true; // Line break
case '\r':
return true; // Carriage return
case '\t':
return true; // Tab
case '\u200E':
return true; // LEFT-TO-RIGHT MARK
case '\u200F':
return true; // RIGHT-TO-LEFT MARK
case '\u2028':
return true; // LINE SEPARATOR
case '\u2029':
return true; // PARAGRAPH SEPARATOR
default:
return Character.isWhitespace(aChar);
}
}

@Override
public File exportCasToFile(CAS aCas, SourceDocument aDocument, String aFileName,
FormatSupport aFormat)
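The import service above now only decides whether segmentation is needed and delegates the actual work to the new utility class. Below is a minimal sketch of that check-then-delegate pattern, assuming the DKPro Core Sentence/Token types and a plain type-system lookup; the class and method names in the sketch are illustrative and not part of this commit, and the service's quota checks are omitted.

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;

import de.tudarmstadt.ukp.inception.export.SegmentationUtils;

public class SegmentationFallbackExample
{
    private static final String SENTENCE_TYPE =
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence";
    private static final String TOKEN_TYPE =
            "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token";

    // Create sentences and tokens only if the CAS does not contain any yet.
    // Sentences must be created first because tokenize() iterates over them.
    public static void segmentIfMissing(CAS cas)
    {
        Type sentenceType = cas.getTypeSystem().getType(SENTENCE_TYPE);
        Type tokenType = cas.getTypeSystem().getType(TOKEN_TYPE);

        if (cas.getAnnotationIndex(sentenceType).size() == 0) {
            SegmentationUtils.splitSentences(cas);
        }

        if (cas.getAnnotationIndex(tokenType).size() == 0) {
            SegmentationUtils.tokenize(cas);
        }
    }
}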
SegmentationUtils.java
@@ -0,0 +1,166 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.export;

import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.createSentence;
import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.createToken;
import static de.tudarmstadt.ukp.clarin.webanno.api.annotation.util.WebAnnoCasUtil.selectSentences;

import java.text.BreakIterator;
import java.util.Locale;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.text.AnnotationFS;

import it.unimi.dsi.fastutil.ints.IntArrayList;

public abstract class SegmentationUtils
{
private SegmentationUtils()
{
// No instances
}

public static void splitSentences(CAS aCas)
{
splitSentences(aCas, null);
}

public static void splitSentences(CAS aCas, Iterable<? extends AnnotationFS> aZones)
{
if (aCas.getDocumentText() == null) {
return;
}

int[] sortedZoneBoundaries = null;

if (aZones != null) {
var zoneBoundaries = new IntArrayList();
for (var zone : aZones) {
zoneBoundaries.add(zone.getBegin());
zoneBoundaries.add(zone.getEnd());
}

sortedZoneBoundaries = zoneBoundaries.intStream().distinct().sorted().toArray();
}

if (sortedZoneBoundaries == null || sortedZoneBoundaries.length < 2) {
sortedZoneBoundaries = new int[] { 0, aCas.getDocumentText().length() };
}

for (int i = 1; i < sortedZoneBoundaries.length; i++) {
var begin = sortedZoneBoundaries[i - 1];
var end = sortedZoneBoundaries[i];
BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
bi.setText(aCas.getDocumentText().substring(begin, end));
int last = bi.first();
int cur = bi.next();
while (cur != BreakIterator.DONE) {
int[] span = new int[] { last + begin, cur + begin };
trim(aCas.getDocumentText(), span);
if (!isEmpty(span[0], span[1])) {
aCas.addFsToIndexes(createSentence(aCas, span[0], span[1]));
}
last = cur;
cur = bi.next();
}
}
}

public static void tokenize(CAS aCas)
{
if (aCas.getDocumentText() == null) {
return;
}

BreakIterator bi = BreakIterator.getWordInstance(Locale.US);
for (AnnotationFS s : selectSentences(aCas)) {
bi.setText(s.getCoveredText());
int last = bi.first();
int cur = bi.next();
while (cur != BreakIterator.DONE) {
int[] span = new int[] { last, cur };
trim(s.getCoveredText(), span);
if (!isEmpty(span[0], span[1])) {
aCas.addFsToIndexes(
createToken(aCas, span[0] + s.getBegin(), span[1] + s.getBegin()));
}
last = cur;
cur = bi.next();
}
}
}

/**
* Remove trailing or leading whitespace from the annotation.
*
* @param aText
* the text.
* @param aSpan
* the offsets.
*/
public static void trim(String aText, int[] aSpan)
{
String data = aText;

int begin = aSpan[0];
int end = aSpan[1] - 1;

// Remove whitespace at end
while ((end > 0) && trimChar(data.charAt(end))) {
end--;
}
end++;

// Remove whitespace at start
while ((begin < end) && trimChar(data.charAt(begin))) {
begin++;
}

aSpan[0] = begin;
aSpan[1] = end;
}

public static boolean isEmpty(int aBegin, int aEnd)
{
return aBegin >= aEnd;
}

public static boolean trimChar(final char aChar)
{
switch (aChar) {
case '\n':
return true; // Line break
case '\r':
return true; // Carriage return
case '\t':
return true; // Tab
case '\u200E':
return true; // LEFT-TO-RIGHT MARK
case '\u200F':
return true; // RIGHT-TO-LEFT MARK
case '\u2028':
return true; // LINE SEPARATOR
case '\u2029':
return true; // PARAGRAPH SEPARATOR
default:
return Character.isWhitespace(aChar);
}
}

}
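A quick usage sketch for the extracted class, mirroring the tests below; the uimaFIT JCasFactory/JCasUtil scaffolding and the example class name are assumptions of this sketch, not part of the commit.

import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.toText;

import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.inception.export.SegmentationUtils;

public class SegmentationUtilsUsageExample
{
    public static void main(String[] args) throws Exception
    {
        // A CAS with plain document text and no segmentation annotations yet.
        JCas jcas = JCasFactory.createText("I am one. I am two.", "en");

        // Sentences first, then tokens within each sentence.
        SegmentationUtils.splitSentences(jcas.getCas());
        SegmentationUtils.tokenize(jcas.getCas());

        // Expected: [I am one., I am two.]
        System.out.println(toText(select(jcas, Sentence.class)));

        // Expected: [I, am, one, ., I, am, two, .]
        System.out.println(toText(select(jcas, Token.class)));
    }
}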
SegmentationUtilsTest.java
@@ -31,14 +31,14 @@
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

public class SegmentationTest
public class SegmentationUtilsTest
{
@Test
public void testSplitSentences() throws Exception
{
JCas jcas = JCasFactory.createText("I am one. I am two.", "en");

DocumentImportExportServiceImpl.splitSentences(jcas.getCas());
SegmentationUtils.splitSentences(jcas.getCas());

assertThat(toText(select(jcas, Sentence.class))) //
.containsExactly("I am one.", "I am two.");
@@ -51,7 +51,7 @@ public void testSplitSentencesWithZones() throws Exception
new Heading(jcas, 0, 7).addToIndexes();
new Paragraph(jcas, 8, 17).addToIndexes();

DocumentImportExportServiceImpl.splitSentences(jcas.getCas(), jcas.select(Div.class));
SegmentationUtils.splitSentences(jcas.getCas(), jcas.select(Div.class));

assertThat(toText(select(jcas, Sentence.class))) //
.containsExactly("Heading", "I am two.");
@@ -64,7 +64,7 @@ public void testTokenize() throws Exception
new Sentence(jcas, 0, 9).addToIndexes();
new Sentence(jcas, 9, 18).addToIndexes();

DocumentImportExportServiceImpl.tokenize(jcas.getCas());
SegmentationUtils.tokenize(jcas.getCas());

assertThat(toText(select(jcas, Sentence.class))) //
.containsExactly("i am one.", "i am two.");
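The trim/trimChar/isEmpty helpers are not exercised by the tests shown above. A hypothetical additional test along the following lines would cover them; the class and method names are illustrative, and the JUnit 5 / AssertJ setup is assumed to match the existing test class.

import static org.assertj.core.api.Assertions.assertThat;

import org.junit.jupiter.api.Test;

import de.tudarmstadt.ukp.inception.export.SegmentationUtils;

public class SegmentationUtilsTrimTest
{
    @Test
    public void testTrimStripsWhitespaceAndDirectionMarks() throws Exception
    {
        // Leading tab and trailing LEFT-TO-RIGHT MARK should be trimmed away.
        String text = "\tHello\u200E";
        int[] span = { 0, text.length() };

        SegmentationUtils.trim(text, span);

        assertThat(text.substring(span[0], span[1])).isEqualTo("Hello");
        assertThat(SegmentationUtils.isEmpty(span[0], span[1])).isFalse();
    }
}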