diff --git a/webanno-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-text.adoc b/webanno-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-text.adoc index b17a759c5fd..874d57d3059 100644 --- a/webanno-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-text.adoc +++ b/webanno-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-text.adoc @@ -17,7 +17,7 @@ [[sect_formats_text]] = Plain Text -Basic UTF-8 plain text. +Basic UTF-8 plain text. Automatic sentence and token detection will be performed. [cols="2,1,1,1,3"] |==== @@ -29,3 +29,36 @@ Basic UTF-8 plain text. | no | No annotations |==== + +[[sect_formats_text_sentence_per_line]] += Plain Text (one sentence per line) + +Basic UTF-8 plain text where each line is interpreted as one sentence. + +[cols="2,1,1,1,3"] +|==== +| Format | Read | Write | Custom Layers | Description + +| Plain text +| yes +| no +| no +| No annotations +|==== + +[[sect_formats_text_pretokenized]] += Plain Text (pretokenized) + +Basic UTF-8 plain text. Tokens are taken to be separated by whitespace. Each line is interpreted as a +sentence. 
+ +[cols="2,1,1,1,3"] +|==== +| Format | Read | Write | Custom Layers | Description + +| Plain text +| yes +| no +| no +| No annotations +|==== diff --git a/webanno-io-text/pom.xml b/webanno-io-text/pom.xml index fb8a15b1c0d..2e7665b16f6 100644 --- a/webanno-io-text/pom.xml +++ b/webanno-io-text/pom.xml @@ -65,6 +65,11 @@ <groupId>org.dkpro.core</groupId> <artifactId>dkpro-core-api-parameter-asl</artifactId> </dependency> + <dependency> + <groupId>org.assertj</groupId> + <artifactId>assertj-core</artifactId> + <scope>test</scope> + </dependency> <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> diff --git a/webanno-io-text/src/main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReader.java b/webanno-io-text/src/main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReader.java new file mode 100644 index 00000000000..6327ac72a88 --- /dev/null +++ b/webanno-io-text/src/main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReader.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package de.tudarmstadt.ukp.clarin.webanno.text;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.fit.descriptor.TypeCapability;
+import org.apache.uima.jcas.JCas;
+import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
+
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
+
+/**
+ * UIMA collection reader for plain text files, whitespace-separated tokens, one sentence per line.
+ * Lines that contain nothing but whitespace do not produce a sentence.
+ */
+@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" })
+public class PretokenizedLineOrientedTextReader
+    extends JCasResourceCollectionReader_ImplBase
+{
+    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
+
+    @Override
+    public void getNext(JCas aJCas) throws IOException, CollectionException
+    {
+        Resource res = nextFile();
+        initCas(aJCas, res);
+
+        try (InputStream is = new BufferedInputStream(res.getInputStream())) {
+            aJCas.setDocumentText(IOUtils.toString(is, "UTF-8"));
+        }
+
+        // One sentence per line; the last line may lack a terminating line break.
+        String text = aJCas.getDocumentText();
+        int start = 0;
+        while (start < text.length()) {
+            int lineBreak = text.indexOf('\n', start);
+            int end = lineBreak >= 0 ? lineBreak : text.length();
+            createSentence(aJCas, start, end);
+            start = end + 1;
+        }
+    }
+
+    /**
+     * Adds a sentence for the trimmed span {@code [aBegin, aEnd)} of the document text and one
+     * token for each whitespace-separated chunk inside that sentence.
+     *
+     * @return the sentence, or {@code null} if nothing remains of the span after trimming.
+     */
+    protected Sentence createSentence(final JCas aJCas, final int aBegin, final int aEnd)
+    {
+        int[] span = new int[] { aBegin, aEnd };
+        trim(aJCas.getDocumentText(), span);
+        if (isEmpty(span[0], span[1])) {
+            return null;
+        }
+
+        Sentence seg = new Sentence(aJCas, span[0], span[1]);
+        seg.addToIndexes(aJCas);
+
+        // Matcher offsets are relative to the sentence, so shift them by the sentence begin.
+        Matcher whitespaceMatcher = WHITESPACE.matcher(seg.getCoveredText());
+        int prevBegin = 0;
+        while (whitespaceMatcher.find()) {
+            int end = whitespaceMatcher.start();
+            createToken(aJCas, seg.getBegin() + prevBegin, seg.getBegin() + end);
+            prevBegin = whitespaceMatcher.end();
+        }
+
+        // Compare in document offsets: prevBegin on its own is sentence-relative.
+        if (seg.getBegin() + prevBegin < seg.getEnd()) {
+            createToken(aJCas, seg.getBegin() + prevBegin, seg.getEnd());
+        }
+
+        return seg;
+    }
+
+    /**
+     * Adds a token for the trimmed span {@code [aBegin, aEnd)} of the document text.
+     *
+     * @return the token, or {@code null} if nothing remains of the span after trimming.
+     */
+    protected Token createToken(final JCas aJCas, final int aBegin, final int aEnd)
+    {
+        int[] span = new int[] { aBegin, aEnd };
+        trim(aJCas.getDocumentText(), span);
+        if (isEmpty(span[0], span[1])) {
+            return null;
+        }
+
+        Token seg = new Token(aJCas, span[0], span[1]);
+        seg.addToIndexes(aJCas);
+        return seg;
+    }
+
+    /**
+     * Remove trailing or leading whitespace from the annotation. The offsets never leave the
+     * given span; a span holding only trimmable characters becomes empty (begin equals end).
+     *
+     * @param aText
+     *            the text.
+     * @param aSpan
+     *            the offsets, updated in place.
+     */
+    public void trim(String aText, int[] aSpan)
+    {
+        int begin = aSpan[0];
+        int end = aSpan[1];
+
+        while ((begin < end) && trimChar(aText.charAt(begin))) {
+            begin++;
+        }
+        while ((end > begin) && trimChar(aText.charAt(end - 1))) {
+            end--;
+        }
+
+        aSpan[0] = begin;
+        aSpan[1] = end;
+    }
+
+    /**
+     * @return whether the span {@code [aBegin, aEnd)} covers no characters.
+     */
+    public boolean isEmpty(int aBegin, int aEnd)
+    {
+        return aBegin >= aEnd;
+    }
+
+    /**
+     * @return whether the character is to be stripped from the edges of an annotation.
+     */
+    public boolean trimChar(final char aChar)
+    {
+        switch (aChar) {
+        case '\u200E': // LEFT-TO-RIGHT MARK
+        case '\u200F': // RIGHT-TO-LEFT MARK
+            return true;
+        default:
+            // Covers line breaks, tabs and the Unicode LINE/PARAGRAPH SEPARATOR as well.
+            return Character.isWhitespace(aChar);
+        }
+    }
+}
diff --git a/webanno-io-text/src/main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedTextFormatSupport.java b/webanno-io-text/src/main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedTextFormatSupport.java
new file mode 100644
index 00000000000..0c9a19670e4
--- /dev/null
+++ 
b/webanno-io-text/src/main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedTextFormatSupport.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Technische Universität Darmstadt under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The Technische Universität Darmstadt
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.tudarmstadt.ukp.clarin.webanno.text;
+
+import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
+
+import org.apache.uima.collection.CollectionReaderDescription;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.springframework.stereotype.Component;
+
+import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport;
+
+/**
+ * Format support for plain text with whitespace-separated tokens and one sentence per line. It
+ * is readable and backed by {@link PretokenizedLineOrientedTextReader}.
+ */
+@Component
+public class PretokenizedTextFormatSupport
+    implements FormatSupport
+{
+    public static final String ID = "pretokenized-textlines";
+    public static final String NAME = "Plain text (space-separated tokens, one sentence per line)";
+
+    /**
+     * @return a description of the {@link PretokenizedLineOrientedTextReader} for the given type
+     *         system.
+     */
+    @Override
+    public CollectionReaderDescription getReaderDescription(TypeSystemDescription aTSD)
+        throws ResourceInitializationException
+    {
+        return createReaderDescription(PretokenizedLineOrientedTextReader.class, aTSD);
+    }
+
+    /**
+     * @return {@code true}, as this format can be read.
+     */
+    @Override
+    public boolean isReadable()
+    {
+        return true;
+    }
+
+    /**
+     * @return the format name, {@link #NAME}.
+     */
+    @Override
+    public String getName()
+    {
+        return NAME;
+    }
+
+    /**
+     * @return the format identifier, {@link #ID}.
+     */
+    @Override
+    public String getId()
+    {
+        return ID;
+    }
+}
diff --git a/webanno-io-text/src/test/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReaderTest.java b/webanno-io-text/src/test/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReaderTest.java
new file mode 100644
index 00000000000..77828cabb64
--- /dev/null
+++ b/webanno-io-text/src/test/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReaderTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Technische Universität Darmstadt under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The Technische Universität Darmstadt
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.tudarmstadt.ukp.clarin.webanno.text;
+
+import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader;
+import static org.apache.uima.fit.util.JCasUtil.select;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.factory.JCasFactory;
+import org.apache.uima.jcas.JCas;
+import org.junit.Test;
+
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
+
+/**
+ * Tests for {@link PretokenizedLineOrientedTextReader}.
+ */
+public class PretokenizedLineOrientedTextReaderTest
+{
+    /** Checks the sentence and token counts produced for {@code LICENSE.txt}. */
+    @Test
+    public void test() throws Exception
+    {
+        JCas jcas = read("LICENSE.txt");
+
+        assertEquals(169, select(jcas, Sentence.class).size());
+        assertEquals(1412, select(jcas, Token.class).size());
+    }
+
+    /** The last line becomes a sentence even when the file lacks a final line break. */
+    @Test
+    public void testSpaceSeparatedNoFinalLineBreak() throws Exception
+    {
+        JCas jcas = read("src/test/resources/text/space-separated-no-final-line-break.txt");
+
+        assertThat(select(jcas, Sentence.class)) //
+                .extracting(sentence -> sentence.getCoveredText()) //
+                .containsExactly("1 2 3", "4 5 6", "7 8 9");
+
+        assertThat(select(jcas, Token.class)) //
+                .extracting(token -> token.getCoveredText()) //
+                .containsExactly("1", "2", "3", "4", "5", "6", "7", "8", "9");
+    }
+
+    /** Punctuation is not split off; it stays attached to the neighbouring token. */
+    @Test
+    public void testTextWithPunctuation() throws Exception
+    {
+        JCas jcas = read("src/test/resources/text/text-with-punctuation.txt");
+
+        assertThat(select(jcas, Sentence.class)) //
+                .extracting(sentence -> sentence.getCoveredText()) //
+                .containsExactly("Yesterday, I left the house.");
+
+        assertThat(select(jcas, Token.class)) //
+                .extracting(token -> token.getCoveredText()) //
+                .containsExactly("Yesterday,", "I", "left", "the", "house.");
+    }
+
+    /** Runs the reader on the file at the given location and returns the resulting JCas. */
+    private static JCas read(String aLocation) throws Exception
+    {
+        JCas jcas = JCasFactory.createJCas();
+        CollectionReader reader = createReader(PretokenizedLineOrientedTextReader.class,
+                PretokenizedLineOrientedTextReader.PARAM_SOURCE_LOCATION, aLocation);
+        reader.getNext(jcas.getCas());
+
+        return jcas;
+    }
+}
diff --git a/webanno-io-text/src/test/resources/text/space-separated-no-final-line-break.txt b/webanno-io-text/src/test/resources/text/space-separated-no-final-line-break.txt
new file mode 100644
index 00000000000..86bb2a32b06
--- /dev/null
+++ b/webanno-io-text/src/test/resources/text/space-separated-no-final-line-break.txt
@@ -0,0 +1,3 @@
+1 2 3
+4 5 6
+7 8 9
\ No newline at end of file
diff --git a/webanno-io-text/src/test/resources/text/text-with-punctuation.txt b/webanno-io-text/src/test/resources/text/text-with-punctuation.txt
new file mode 100644
index 00000000000..a07f2273271
--- /dev/null
+++ b/webanno-io-text/src/test/resources/text/text-with-punctuation.txt
@@ -0,0 +1 @@
+Yesterday, I left the house.
\ No newline at end of file