Skip to content

Commit

Permalink
#1828 - A simple import of already tokenized text to which no further…
Browse files Browse the repository at this point in the history
… tokenization is applied

- Added simple reader for pre-tokenized text / one sentence per line / tokens whitespace separated
  • Loading branch information
reckart committed Jan 21, 2021
1 parent 16c7479 commit 1e05635
Show file tree
Hide file tree
Showing 7 changed files with 356 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
[[sect_formats_text]]
= Plain Text

Basic UTF-8 plain text.
Basic UTF-8 plain text. Automatic sentence and token detection will be performed.

[cols="2,1,1,1,3"]
|====
Expand All @@ -29,3 +29,36 @@ Basic UTF-8 plain text.
| no
| No annotations
|====

[[sect_formats_text_sentence_per_line]]
= Plain Text (one sentence per line)

Basic UTF-8 plain text where each line is interpreted as one sentence.

[cols="2,1,1,1,3"]
|====
| Format | Read | Write | Custom Layers | Description

| Plain text (one sentence per line)
| yes
| no
| no
| No annotations
|====

[[sect_formats_text_pretokenized]]
= Plain Text (pretokenized)

Basic UTF-8 plain text that has already been tokenized. Each line is interpreted as one sentence and
tokens are taken to be separated by whitespace. No further sentence or token detection is applied.

[cols="2,1,1,1,3"]
|====
| Format | Read | Write | Custom Layers | Description

| Plain text (space-separated tokens, one sentence per line)
| yes
| no
| no
| No annotations
|====
5 changes: 5 additions & 0 deletions webanno-io-text/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-parameter-asl</artifactId>
</dependency>
<dependency>
<groupId>org.assertj</groupId>
<artifactId>assertj-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.clarin.webanno.text;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.jcas.JCas;
import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * UIMA collection reader for pre-tokenized plain text files. The file content is read as UTF-8.
 * Every line becomes one {@link Sentence} and every whitespace-separated chunk within a line
 * becomes one {@link Token}. Lines and chunks which consist only of trimmable characters (see
 * {@link #trimChar(char)}) do not produce any annotation.
 */
@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence",
        "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token" })
public class PretokenizedLineOrientedTextReader
    extends JCasResourceCollectionReader_ImplBase
{
    /** Token delimiter: one or more whitespace characters as matched by the regex class \s. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Reads the next file into the given CAS, sets its content as the document text and adds
     * sentence and token annotations.
     *
     * @param aJCas
     *            the CAS to fill.
     * @throws IOException
     *             if the file cannot be read.
     * @throws CollectionException
     *             declared by the overridden method.
     */
    @Override
    public void getNext(JCas aJCas) throws IOException, CollectionException
    {
        Resource res = nextFile();
        initCas(aJCas, res);

        try (InputStream is = new BufferedInputStream(res.getInputStream())) {
            aJCas.setDocumentText(IOUtils.toString(is, "UTF-8"));
        }

        // Walk over the text line by line. A trailing line without a final line break is handled
        // by treating the end of the text as the end of the line.
        String text = aJCas.getDocumentText();
        int lineBegin = 0;
        while (lineBegin < text.length()) {
            int lineEnd = text.indexOf('\n', lineBegin);
            if (lineEnd < 0) {
                lineEnd = text.length();
            }
            createSentence(aJCas, lineBegin, lineEnd);
            lineBegin = lineEnd + 1;
        }
    }

    /**
     * Creates a sentence for the given span (after trimming) and tokens for every
     * whitespace-separated chunk inside it.
     *
     * @param aJCas
     *            the CAS to add the annotations to.
     * @param aBegin
     *            begin offset of the line in the document text (inclusive).
     * @param aEnd
     *            end offset of the line in the document text (exclusive).
     * @return the sentence which has been added to the indexes or {@code null} if the span was
     *         empty after trimming.
     */
    protected Sentence createSentence(final JCas aJCas, final int aBegin, final int aEnd)
    {
        int[] span = new int[] { aBegin, aEnd };
        trim(aJCas.getDocumentText(), span);
        if (isEmpty(span[0], span[1])) {
            return null;
        }

        Sentence sentence = new Sentence(aJCas, span[0], span[1]);
        sentence.addToIndexes(aJCas);

        // The matcher works on the sentence text, so all offsets obtained from it are relative to
        // the sentence begin and have to be shifted before they are used as document offsets.
        String sentenceText = sentence.getCoveredText();
        int sentenceBegin = sentence.getBegin();
        Matcher delimiter = WHITESPACE.matcher(sentenceText);
        int tokenBegin = 0;
        while (delimiter.find()) {
            createToken(aJCas, sentenceBegin + tokenBegin, sentenceBegin + delimiter.start());
            tokenBegin = delimiter.end();
        }

        // Remaining text after the last delimiter. Both sides of the comparison are relative to
        // the sentence begin.
        if (tokenBegin < sentenceText.length()) {
            createToken(aJCas, sentenceBegin + tokenBegin, sentence.getEnd());
        }

        return sentence;
    }

    /**
     * Creates a token for the given span (after trimming).
     *
     * @param aJCas
     *            the CAS to add the annotation to.
     * @param aBegin
     *            begin offset in the document text (inclusive).
     * @param aEnd
     *            end offset in the document text (exclusive).
     * @return the token which has been added to the indexes or {@code null} if the span was empty
     *         after trimming.
     */
    protected Token createToken(final JCas aJCas, final int aBegin, final int aEnd)
    {
        int[] span = new int[] { aBegin, aEnd };
        trim(aJCas.getDocumentText(), span);
        if (isEmpty(span[0], span[1])) {
            return null;
        }

        Token token = new Token(aJCas, span[0], span[1]);
        token.addToIndexes(aJCas);
        return token;
    }

    /**
     * Remove trailing or leading whitespace from the annotation. The offsets never move outside
     * of the span that was passed in. If the span contains only trimmable characters, begin and
     * end are equal afterwards.
     *
     * @param aText
     *            the text.
     * @param aSpan
     *            the offsets: {@code aSpan[0]} is the begin (inclusive), {@code aSpan[1]} is the
     *            end (exclusive). The array is updated in place.
     */
    public void trim(String aText, int[] aSpan)
    {
        int begin = aSpan[0];
        int end = aSpan[1];

        while (begin < end && trimChar(aText.charAt(begin))) {
            begin++;
        }
        while (end > begin && trimChar(aText.charAt(end - 1))) {
            end--;
        }

        aSpan[0] = begin;
        aSpan[1] = end;
    }

    /**
     * Checks whether the given offsets describe an empty span.
     *
     * @param aBegin
     *            the begin offset (inclusive).
     * @param aEnd
     *            the end offset (exclusive).
     * @return {@code true} if the begin is not located before the end.
     */
    public boolean isEmpty(int aBegin, int aEnd)
    {
        return aBegin >= aEnd;
    }

    /**
     * Checks whether the given character is removed when it appears at the begin or end of a
     * span.
     *
     * @param aChar
     *            the character to check.
     * @return {@code true} for the LEFT-TO-RIGHT MARK (U+200E), the RIGHT-TO-LEFT MARK (U+200F)
     *         and for every character for which {@link Character#isWhitespace(char)} returns
     *         {@code true}.
     */
    public boolean trimChar(final char aChar)
    {
        switch (aChar) {
        // The directional marks are not whitespace according to Character.isWhitespace and
        // therefore need to be listed explicitly. Line break, carriage return, tab, LINE
        // SEPARATOR and PARAGRAPH SEPARATOR are covered by the default branch.
        case '\u200E': // LEFT-TO-RIGHT MARK
        case '\u200F': // RIGHT-TO-LEFT MARK
            return true;
        default:
            return Character.isWhitespace(aChar);
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.clarin.webanno.text;

import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;

import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.springframework.stereotype.Component;

import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport;

/**
 * Format support which makes the {@link PretokenizedLineOrientedTextReader} available as a
 * read-only format.
 */
@Component
public class PretokenizedTextFormatSupport
    implements FormatSupport
{
    /** Identifier returned by {@link #getId()}. */
    public static final String ID = "pretokenized-textlines";

    /** Name returned by {@link #getName()}. */
    public static final String NAME = "Plain text (space-separated tokens, one sentence per line)";

    /**
     * @return the format identifier {@link #ID}.
     */
    @Override
    public String getId()
    {
        return ID;
    }

    /**
     * @return the format name {@link #NAME}.
     */
    @Override
    public String getName()
    {
        return NAME;
    }

    /**
     * @return always {@code true}.
     */
    @Override
    public boolean isReadable()
    {
        return true;
    }

    /**
     * Creates the description of the reader used to import documents in this format.
     *
     * @param aTSD
     *            the type system which is passed on to the reader description.
     * @return a description of the {@link PretokenizedLineOrientedTextReader}.
     * @throws ResourceInitializationException
     *             if the description cannot be created.
     */
    @Override
    public CollectionReaderDescription getReaderDescription(TypeSystemDescription aTSD)
        throws ResourceInitializationException
    {
        CollectionReaderDescription readerDescription = createReaderDescription(
                PretokenizedLineOrientedTextReader.class, aTSD);
        return readerDescription;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.clarin.webanno.text;

import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader;
import static org.apache.uima.fit.util.JCasUtil.select;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.Assert.assertEquals;

import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.factory.JCasFactory;
import org.apache.uima.jcas.JCas;
import org.junit.Test;

import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;

/**
 * Tests for the {@link PretokenizedLineOrientedTextReader}.
 */
public class PretokenizedLineOrientedTextReaderTest
{
    /**
     * Reads a larger file and checks the number of sentences and tokens. The expected counts are
     * tied to the content of the file at the given location.
     */
    @Test
    public void test() throws Exception
    {
        JCas doc = read("LICENSE.txt");

        assertEquals(169, select(doc, Sentence.class).size());
        assertEquals(1412, select(doc, Token.class).size());
    }

    /**
     * The last line of the input has no line break, but must still yield a sentence.
     */
    @Test
    public void testSpaceSeparatedNoFinalLineBreak() throws Exception
    {
        JCas doc = read("src/test/resources/text/space-separated-no-final-line-break.txt");

        assertThat(select(doc, Sentence.class)) //
                .extracting(sentence -> sentence.getCoveredText()) //
                .containsExactly("1 2 3", "4 5 6", "7 8 9");

        assertThat(select(doc, Token.class)) //
                .extracting(token -> token.getCoveredText()) //
                .containsExactly("1", "2", "3", "4", "5", "6", "7", "8", "9");
    }

    /**
     * Punctuation which is not separated by whitespace must stay attached to its token.
     */
    @Test
    public void testTextWithPunctuation() throws Exception
    {
        JCas doc = read("src/test/resources/text/text-with-punctuation.txt");

        assertThat(select(doc, Sentence.class)) //
                .extracting(sentence -> sentence.getCoveredText()) //
                .containsExactly("Yesterday, I left the house.");

        assertThat(select(doc, Token.class)) //
                .extracting(token -> token.getCoveredText()) //
                .containsExactly("Yesterday,", "I", "left", "the", "house.");
    }

    /**
     * Creates a fresh CAS and fills it with the first document the reader provides for the given
     * source location.
     */
    private static JCas read(String aLocation) throws Exception
    {
        JCas jcas = JCasFactory.createJCas();

        CollectionReader reader = createReader(PretokenizedLineOrientedTextReader.class,
                PretokenizedLineOrientedTextReader.PARAM_SOURCE_LOCATION, aLocation);

        reader.getNext(jcas.getCas());

        return jcas;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
1 2 3
4 5 6
7 8 9
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Yesterday, I left the house.

0 comments on commit 1e05635

Please sign in to comment.