-
Notifications
You must be signed in to change notification settings - Fork 154
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1864 from webanno/feature/1828-pre-tokenized-text-format-support
#1828 - A simple import of already tokenized text to which no further tokenization is applied
- Loading branch information
Showing
7 changed files
with
356 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
165 changes: 165 additions & 0 deletions
165
.../main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,165 @@ | ||
/* | ||
* Licensed to the Technische Universität Darmstadt under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The Technische Universität Darmstadt | ||
* licenses this file to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package de.tudarmstadt.ukp.clarin.webanno.text; | ||
|
||
import java.io.BufferedInputStream; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.util.regex.Matcher; | ||
import java.util.regex.Pattern; | ||
|
||
import org.apache.commons.io.IOUtils; | ||
import org.apache.uima.collection.CollectionException; | ||
import org.apache.uima.fit.descriptor.TypeCapability; | ||
import org.apache.uima.jcas.JCas; | ||
import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; | ||
|
||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; | ||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; | ||
|
||
/** | ||
* UIMA collection reader for plain text files, whitespace-separated tokens, one sentence per line. | ||
*/ | ||
@TypeCapability(outputs = { "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" }) | ||
public class PretokenizedLineOrientedTextReader | ||
extends JCasResourceCollectionReader_ImplBase | ||
{ | ||
private static final Pattern whitespace = Pattern.compile("\\s+"); | ||
|
||
@Override | ||
public void getNext(JCas aJCas) throws IOException, CollectionException | ||
{ | ||
Resource res = nextFile(); | ||
initCas(aJCas, res); | ||
|
||
try (InputStream is = new BufferedInputStream(res.getInputStream())) { | ||
aJCas.setDocumentText(IOUtils.toString(is, "UTF-8")); | ||
} | ||
|
||
String t = aJCas.getDocumentText(); | ||
int start = 0; | ||
int end = t.indexOf('\n'); | ||
while (end >= 0) { | ||
createSentence(aJCas, start, end); | ||
start = end + 1; | ||
if (start < t.length()) { | ||
end = t.indexOf('\n', start); | ||
} | ||
else { | ||
end = -1; | ||
} | ||
} | ||
|
||
if (start < t.length()) { | ||
createSentence(aJCas, start, t.length()); | ||
} | ||
} | ||
|
||
protected Sentence createSentence(final JCas aJCas, final int aBegin, final int aEnd) | ||
{ | ||
int[] span = new int[] { aBegin, aEnd }; | ||
trim(aJCas.getDocumentText(), span); | ||
if (!isEmpty(span[0], span[1])) { | ||
Sentence seg = new Sentence(aJCas, span[0], span[1]); | ||
seg.addToIndexes(aJCas); | ||
|
||
Matcher whitespaceMatcher = whitespace.matcher(seg.getCoveredText()); | ||
int prevBegin = 0; | ||
while (whitespaceMatcher.find()) { | ||
int end = whitespaceMatcher.start(); | ||
createToken(aJCas, seg.getBegin() + prevBegin, seg.getBegin() + end); | ||
prevBegin = whitespaceMatcher.end(); | ||
} | ||
|
||
if (prevBegin < aEnd) { | ||
createToken(aJCas, seg.getBegin() + prevBegin, seg.getEnd()); | ||
} | ||
|
||
return seg; | ||
} | ||
else { | ||
return null; | ||
} | ||
} | ||
|
||
protected Token createToken(final JCas aJCas, final int aBegin, final int aEnd) | ||
{ | ||
int[] span = new int[] { aBegin, aEnd }; | ||
trim(aJCas.getDocumentText(), span); | ||
if (!isEmpty(span[0], span[1])) { | ||
Token seg = new Token(aJCas, span[0], span[1]); | ||
seg.addToIndexes(aJCas); | ||
return seg; | ||
} | ||
else { | ||
return null; | ||
} | ||
} | ||
|
||
/** | ||
* Remove trailing or leading whitespace from the annotation. | ||
* | ||
* @param aText | ||
* the text. | ||
* @param aSpan | ||
* the offsets. | ||
*/ | ||
public void trim(String aText, int[] aSpan) | ||
{ | ||
int begin = aSpan[0]; | ||
int end = aSpan[1] - 1; | ||
|
||
while ((begin < (aText.length() - 1)) && trimChar(aText.charAt(begin))) { | ||
begin++; | ||
} | ||
while ((end > 0) && trimChar(aText.charAt(end))) { | ||
end--; | ||
} | ||
end++; | ||
|
||
aSpan[0] = begin; | ||
aSpan[1] = end; | ||
} | ||
|
||
public boolean isEmpty(int aBegin, int aEnd) | ||
{ | ||
return aBegin >= aEnd; | ||
} | ||
|
||
public boolean trimChar(final char aChar) | ||
{ | ||
switch (aChar) { | ||
case '\n': | ||
return true; // Line break | ||
case '\r': | ||
return true; // Carriage return | ||
case '\t': | ||
return true; // Tab | ||
case '\u200E': | ||
return true; // LEFT-TO-RIGHT MARK | ||
case '\u200F': | ||
return true; // RIGHT-TO-LEFT MARK | ||
case '\u2028': | ||
return true; // LINE SEPARATOR | ||
case '\u2029': | ||
return true; // PARAGRAPH SEPARATOR | ||
default: | ||
return Character.isWhitespace(aChar); | ||
} | ||
} | ||
} |
60 changes: 60 additions & 0 deletions
60
...t/src/main/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedTextFormatSupport.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
/* | ||
* Licensed to the Technische Universität Darmstadt under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The Technische Universität Darmstadt | ||
* licenses this file to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package de.tudarmstadt.ukp.clarin.webanno.text; | ||
|
||
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; | ||
|
||
import org.apache.uima.collection.CollectionReaderDescription; | ||
import org.apache.uima.resource.ResourceInitializationException; | ||
import org.apache.uima.resource.metadata.TypeSystemDescription; | ||
import org.springframework.stereotype.Component; | ||
|
||
import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport; | ||
|
||
@Component | ||
public class PretokenizedTextFormatSupport | ||
implements FormatSupport | ||
{ | ||
public static final String ID = "pretokenized-textlines"; | ||
public static final String NAME = "Plain text (space-separated tokens, one sentence per line)"; | ||
|
||
@Override | ||
public String getId() | ||
{ | ||
return ID; | ||
} | ||
|
||
@Override | ||
public String getName() | ||
{ | ||
return NAME; | ||
} | ||
|
||
@Override | ||
public boolean isReadable() | ||
{ | ||
return true; | ||
} | ||
|
||
@Override | ||
public CollectionReaderDescription getReaderDescription(TypeSystemDescription aTSD) | ||
throws ResourceInitializationException | ||
{ | ||
return createReaderDescription(PretokenizedLineOrientedTextReader.class, aTSD); | ||
} | ||
} |
88 changes: 88 additions & 0 deletions
88
...t/java/de/tudarmstadt/ukp/clarin/webanno/text/PretokenizedLineOrientedTextReaderTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
/* | ||
* Licensed to the Technische Universität Darmstadt under one | ||
* or more contributor license agreements. See the NOTICE file | ||
* distributed with this work for additional information | ||
* regarding copyright ownership. The Technische Universität Darmstadt | ||
* licenses this file to you under the Apache License, Version 2.0 (the | ||
* "License"); you may not use this file except in compliance | ||
* with the License. | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package de.tudarmstadt.ukp.clarin.webanno.text; | ||
|
||
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader; | ||
import static org.apache.uima.fit.util.JCasUtil.select; | ||
import static org.assertj.core.api.Assertions.assertThat; | ||
import static org.junit.Assert.assertEquals; | ||
|
||
import org.apache.uima.collection.CollectionReader; | ||
import org.apache.uima.fit.factory.JCasFactory; | ||
import org.apache.uima.jcas.JCas; | ||
import org.junit.Test; | ||
|
||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; | ||
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; | ||
|
||
public class PretokenizedLineOrientedTextReaderTest | ||
{ | ||
@Test | ||
public void test() throws Exception | ||
{ | ||
JCas doc = JCasFactory.createJCas(); | ||
|
||
CollectionReader reader = createReader(PretokenizedLineOrientedTextReader.class, | ||
PretokenizedLineOrientedTextReader.PARAM_SOURCE_LOCATION, "LICENSE.txt"); | ||
|
||
reader.getNext(doc.getCas()); | ||
|
||
assertEquals(169, select(doc, Sentence.class).size()); | ||
assertEquals(1581, select(doc, Token.class).size()); | ||
} | ||
|
||
@Test | ||
public void testSpaceSeparatedNoFinalLineBreak() throws Exception | ||
{ | ||
JCas doc = JCasFactory.createJCas(); | ||
|
||
CollectionReader reader = createReader(PretokenizedLineOrientedTextReader.class, | ||
PretokenizedLineOrientedTextReader.PARAM_SOURCE_LOCATION, | ||
"src/test/resources/text/space-separated-no-final-line-break.txt"); | ||
|
||
reader.getNext(doc.getCas()); | ||
|
||
assertThat(select(doc, Sentence.class)) // | ||
.extracting(s -> s.getCoveredText()) // | ||
.containsExactly("1 2 3", "4 5 6", "7 8 9"); | ||
|
||
assertThat(select(doc, Token.class)) // | ||
.extracting(t -> t.getCoveredText()) // | ||
.containsExactly("1", "2", "3", "4", "5", "6", "7", "8", "9"); | ||
} | ||
|
||
@Test | ||
public void testTextWithPunctuation() throws Exception | ||
{ | ||
JCas doc = JCasFactory.createJCas(); | ||
|
||
CollectionReader reader = createReader(PretokenizedLineOrientedTextReader.class, | ||
PretokenizedLineOrientedTextReader.PARAM_SOURCE_LOCATION, | ||
"src/test/resources/text/text-with-punctuation.txt"); | ||
|
||
reader.getNext(doc.getCas()); | ||
|
||
assertThat(select(doc, Sentence.class)) // | ||
.extracting(s -> s.getCoveredText()) // | ||
.containsExactly("Yesterday, I left the house."); | ||
|
||
assertThat(select(doc, Token.class)) // | ||
.extracting(t -> t.getCoveredText()) // | ||
.containsExactly("Yesterday,", "I", "left", "the", "house."); | ||
} | ||
} |
3 changes: 3 additions & 0 deletions
3
webanno-io-text/src/test/resources/text/space-separated-no-final-line-break.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
1 2 3 | ||
4 5 6 | ||
7 8 9 |
1 change: 1 addition & 0 deletions
1
webanno-io-text/src/test/resources/text/text-with-punctuation.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Yesterday, I left the house. |