diff --git a/webanno-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-text.adoc b/webanno-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-text.adoc
index b17a759c5fd..874d57d3059 100644
--- a/webanno-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-text.adoc
+++ b/webanno-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-text.adoc
@@ -17,7 +17,7 @@
[[sect_formats_text]]
= Plain Text
-Basic UTF-8 plain text.
+Basic UTF-8 plain text. Automatic sentence and token detection will be performed.
[cols="2,1,1,1,3"]
|====
@@ -29,3 +29,36 @@ Basic UTF-8 plain text.
| no
| No annotations
|====
+
+[[sect_formats_text_sentence_per_line]]
+= Plain Text (one sentence per line)
+
+Basic UTF-8 plain text where each line is interpreted as one sentence.
+
+[cols="2,1,1,1,3"]
+|====
+| Format | Read | Write | Custom Layers | Description
+
+| Plain text (one sentence per line)
+| yes
+| no
+| no
+| No annotations
+|====
+
+[[sect_formats_text_pretokenized]]
+= Plain Text (pretokenized)
+
+Basic UTF-8 plain text. Tokens are taken to be separated by whitespace. Each line is interpreted as a
+sentence.
+
+[cols="2,1,1,1,3"]
+|====
+| Format | Read | Write | Custom Layers | Description
+
+| Plain text (pretokenized)
+| yes
+| no
+| no
+| No annotations
+|====
diff --git a/webanno-io-text/pom.xml b/webanno-io-text/pom.xml
index fb8a15b1c0d..2e7665b16f6 100644
--- a/webanno-io-text/pom.xml
+++ b/webanno-io-text/pom.xml
@@ -65,6 +65,11 @@
org.dkpro.core
dkpro-core-api-parameter-asl
+
+ org.assertj
+ assertj-core
+ test
+
junit
junit
diff --git a/webanno-io-text/src/main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReader.java b/webanno-io-text/src/main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReader.java
new file mode 100644
index 00000000000..6327ac72a88
--- /dev/null
+++ b/webanno-io-text/src/main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReader.java
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Technische Universität Darmstadt under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The Technische Universität Darmstadt
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.tudarmstadt.ukp.clarin.webanno.text;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.fit.descriptor.TypeCapability;
+import org.apache.uima.jcas.JCas;
+import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase;
+
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
+
+/**
+ * UIMA collection reader for plain text files in which every line is one sentence and the tokens
+ * within a line are separated by whitespace. A {@link Sentence} is added for each non-blank line
+ * and a {@link Token} for each whitespace-delimited chunk of it.
+ */
+@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" })
+public class PretokenizedLineOrientedTextReader
+    extends JCasResourceCollectionReader_ImplBase
+{
+    /** Runs of whitespace that delimit the tokens within a sentence. */
+    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
+
+    /**
+     * Reads the next file as UTF-8, sets it as the document text and segments it line by line.
+     */
+    @Override
+    public void getNext(JCas aJCas) throws IOException, CollectionException
+    {
+        Resource res = nextFile();
+        initCas(aJCas, res);
+
+        try (InputStream is = new BufferedInputStream(res.getInputStream())) {
+            aJCas.setDocumentText(IOUtils.toString(is, "UTF-8"));
+        }
+
+        String text = aJCas.getDocumentText();
+        int start = 0;
+        // Every '\n' closes a line; a preceding '\r' is removed when the sentence is trimmed.
+        for (int end = text.indexOf('\n'); end >= 0; end = text.indexOf('\n', start)) {
+            createSentence(aJCas, start, end);
+            start = end + 1;
+        }
+
+        // The last line does not have to be terminated by a line break.
+        if (start < text.length()) {
+            createSentence(aJCas, start, text.length());
+        }
+    }
+
+    /**
+     * Adds a sentence for the given range (after trimming it) and one token per
+     * whitespace-separated chunk inside that sentence.
+     *
+     * @return the new sentence or {@code null} if the trimmed range is empty.
+     */
+    protected Sentence createSentence(final JCas aJCas, final int aBegin, final int aEnd)
+    {
+        int[] span = new int[] { aBegin, aEnd };
+        trim(aJCas.getDocumentText(), span);
+        if (isEmpty(span[0], span[1])) {
+            return null;
+        }
+
+        Sentence sentence = new Sentence(aJCas, span[0], span[1]);
+        sentence.addToIndexes(aJCas);
+
+        // Matcher offsets are relative to the sentence text, so shift them by the sentence begin.
+        String sentenceText = sentence.getCoveredText();
+        int offset = sentence.getBegin();
+        Matcher gaps = WHITESPACE.matcher(sentenceText);
+        int tokenBegin = 0;
+        while (gaps.find()) {
+            createToken(aJCas, offset + tokenBegin, offset + gaps.start());
+            tokenBegin = gaps.end();
+        }
+
+        // Compare against the sentence length: tokenBegin is sentence-relative while aEnd is not.
+        if (tokenBegin < sentenceText.length()) {
+            createToken(aJCas, offset + tokenBegin, sentence.getEnd());
+        }
+
+        return sentence;
+    }
+
+    /**
+     * Adds a token for the given range after trimming it.
+     *
+     * @return the new token or {@code null} if the trimmed range is empty.
+     */
+    protected Token createToken(final JCas aJCas, final int aBegin, final int aEnd)
+    {
+        int[] span = new int[] { aBegin, aEnd };
+        trim(aJCas.getDocumentText(), span);
+        if (isEmpty(span[0], span[1])) {
+            return null;
+        }
+
+        Token token = new Token(aJCas, span[0], span[1]);
+        token.addToIndexes(aJCas);
+        return token;
+    }
+
+    /**
+     * Remove trailing or leading whitespace from the annotation. Trimming stays inside the given
+     * span, so a span made up of trimmable characters only collapses to an empty span.
+     *
+     * @param aText
+     *            the text.
+     * @param aSpan
+     *            the offsets; updated in place.
+     */
+    public void trim(String aText, int[] aSpan)
+    {
+        int begin = aSpan[0];
+        int end = aSpan[1];
+
+        while (begin < end && trimChar(aText.charAt(begin))) {
+            begin++;
+        }
+        while (end > begin && trimChar(aText.charAt(end - 1))) {
+            end--;
+        }
+
+        aSpan[0] = begin;
+        aSpan[1] = end;
+    }
+
+    /** @return whether the range from aBegin (inclusive) to aEnd (exclusive) is empty. */
+    public boolean isEmpty(int aBegin, int aEnd)
+    {
+        return aBegin >= aEnd;
+    }
+
+    /** @return whether the character is removed from the edges of an annotation. */
+    public boolean trimChar(final char aChar)
+    {
+        // LEFT-TO-RIGHT MARK and RIGHT-TO-LEFT MARK are format characters, not whitespace; line
+        // breaks, tabs and the Unicode line/paragraph separators are whitespace already.
+        return aChar == '\u200E' || aChar == '\u200F' || Character.isWhitespace(aChar);
+    }
+}
diff --git a/webanno-io-text/src/main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedTextFormatSupport.java b/webanno-io-text/src/main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedTextFormatSupport.java
new file mode 100644
index 00000000000..0c9a19670e4
--- /dev/null
+++ b/webanno-io-text/src/main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedTextFormatSupport.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Technische Universität Darmstadt under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The Technische Universität Darmstadt
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.tudarmstadt.ukp.clarin.webanno.text;
+
+import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
+
+import org.apache.uima.collection.CollectionReaderDescription;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.springframework.stereotype.Component;
+
+import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport;
+
+/** Format support for reading whitespace-tokenized plain text with one sentence per line. */
+@Component
+public class PretokenizedTextFormatSupport
+    implements FormatSupport
+{
+    public static final String ID = "pretokenized-textlines";
+    public static final String NAME = "Plain text (space-separated tokens, one sentence per line)";
+    @Override
+    public String getId()
+    {
+        return ID;
+    }
+
+    @Override
+    public String getName()
+    {
+        return NAME;
+    }
+
+    @Override
+    public boolean isReadable()
+    {
+        return true;
+    }
+
+    @Override
+    public CollectionReaderDescription getReaderDescription(TypeSystemDescription aTSD)
+        throws ResourceInitializationException
+    {
+        return createReaderDescription(PretokenizedLineOrientedTextReader.class, aTSD);
+    }
+}
diff --git a/webanno-io-text/src/test/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReaderTest.java b/webanno-io-text/src/test/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReaderTest.java
new file mode 100644
index 00000000000..77828cabb64
--- /dev/null
+++ b/webanno-io-text/src/test/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReaderTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Technische Universität Darmstadt under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The Technische Universität Darmstadt
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.tudarmstadt.ukp.clarin.webanno.text;
+
+import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader;
+import static org.apache.uima.fit.util.JCasUtil.select;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.junit.Assert.assertEquals;
+
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.factory.JCasFactory;
+import org.apache.uima.jcas.JCas;
+import org.junit.Test;
+
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
+
+/** Tests the segmentation produced by {@link PretokenizedLineOrientedTextReader}. */
+public class PretokenizedLineOrientedTextReaderTest
+{
+    /** Checks the overall sentence and token counts for {@code LICENSE.txt}. */
+    @Test
+    public void test() throws Exception
+    {
+        JCas jcas = read("LICENSE.txt");
+
+        assertEquals(169, select(jcas, Sentence.class).size());
+        assertEquals(1412, select(jcas, Token.class).size());
+    }
+
+    /** Checks that the last line is segmented even though no line break follows it. */
+    @Test
+    public void testSpaceSeparatedNoFinalLineBreak() throws Exception
+    {
+        JCas jcas = read("src/test/resources/text/space-separated-no-final-line-break.txt");
+
+        assertThat(select(jcas, Sentence.class)) //
+                .extracting(s -> s.getCoveredText()) //
+                .containsExactly("1 2 3", "4 5 6", "7 8 9");
+
+        assertThat(select(jcas, Token.class)) //
+                .extracting(t -> t.getCoveredText()) //
+                .containsExactly("1", "2", "3", "4", "5", "6", "7", "8", "9");
+    }
+
+    /** Checks that punctuation stays attached to the token it is written next to. */
+    @Test
+    public void testTextWithPunctuation() throws Exception
+    {
+        JCas jcas = read("src/test/resources/text/text-with-punctuation.txt");
+
+        assertThat(select(jcas, Sentence.class)) //
+                .extracting(s -> s.getCoveredText()) //
+                .containsExactly("Yesterday, I left the house.");
+
+        assertThat(select(jcas, Token.class)) //
+                .extracting(t -> t.getCoveredText()) //
+                .containsExactly("Yesterday,", "I", "left", "the", "house.");
+    }
+
+    /** Runs the reader under test on the given location and returns the resulting JCas. */
+    private static JCas read(String aLocation) throws Exception
+    {
+        JCas jcas = JCasFactory.createJCas();
+
+        CollectionReader reader = createReader(PretokenizedLineOrientedTextReader.class,
+                PretokenizedLineOrientedTextReader.PARAM_SOURCE_LOCATION, aLocation);
+
+        reader.getNext(jcas.getCas());
+
+        return jcas;
+    }
+}
diff --git a/webanno-io-text/src/test/resources/text/space-separated-no-final-line-break.txt b/webanno-io-text/src/test/resources/text/space-separated-no-final-line-break.txt
new file mode 100644
index 00000000000..86bb2a32b06
--- /dev/null
+++ b/webanno-io-text/src/test/resources/text/space-separated-no-final-line-break.txt
@@ -0,0 +1,3 @@
+1 2 3
+4 5 6
+7 8 9
\ No newline at end of file
diff --git a/webanno-io-text/src/test/resources/text/text-with-punctuation.txt b/webanno-io-text/src/test/resources/text/text-with-punctuation.txt
new file mode 100644
index 00000000000..a07f2273271
--- /dev/null
+++ b/webanno-io-text/src/test/resources/text/text-with-punctuation.txt
@@ -0,0 +1 @@
+Yesterday, I left the house.
\ No newline at end of file