From 9e5bf02b9854e6b3f68cd1ee98df421ffaaecddb Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Sun, 1 Dec 2019 15:08:56 +0100 Subject: [PATCH] #1535 - Export to CoNLL 2012 format has duplicate (V*) tags - Upgrading to new DKPro Core version which has this issue fixed --- pom.xml | 2 +- .../api/dao/ImportExportServiceImpl.java | 2 +- webanno-io-conll/NOTICE.txt | 3 - .../conll/ConllCoreNlpFormatSupport.java | 2 + .../webanno/conll/ConllCoreNlpReader.java | 382 --------------- .../webanno/conll/ConllCoreNlpWriter.java | 234 --------- .../webanno/conll/ConllUFormatSupport.java | 2 + .../clarin/webanno/conll/ConllUReader.java | 453 ------------------ .../clarin/webanno/conll/ConllUWriter.java | 302 ------------ .../sequencecodec/AdjacentLabelCodec.java | 156 ------ .../conll/sequencecodec/SequenceCodec.java | 27 -- .../conll/sequencecodec/SequenceItem.java | 99 ---- .../conll/ConllCoreNlpReaderWriterTest.java | 66 --- .../webanno/conll/ConllUReaderTest.java | 80 ---- .../webanno/conll/ConllUReaderWriterTest.java | 121 ----- .../webanno/conll/ConllUWriterTest.java | 68 --- .../sequencecodec/AdjacentLabelCodecTest.java | 175 ------- .../conll/sequencecodec/SequenceItemTest.java | 51 -- 18 files changed, 6 insertions(+), 2219 deletions(-) delete mode 100644 webanno-io-conll/NOTICE.txt delete mode 100644 webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpReader.java delete mode 100644 webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpWriter.java delete mode 100644 webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUReader.java delete mode 100644 webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUWriter.java delete mode 100644 webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/AdjacentLabelCodec.java delete mode 100644 webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/SequenceCodec.java delete mode 100644 webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/SequenceItem.java delete mode 100644 webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpReaderWriterTest.java delete mode 100644 webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUReaderTest.java delete mode 100644 webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUReaderWriterTest.java delete mode 100644 webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUWriterTest.java delete mode 100644 webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/AdjacentLabelCodecTest.java delete mode 100644 webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/SequenceItemTest.java diff --git a/pom.xml b/pom.xml index 22c042b6e95..29af0bed347 100644 --- a/pom.xml +++ b/pom.xml @@ -72,7 +72,7 @@ 2.1.7 5.3.10.Final yyyy-MM-dd HH:mm - 1.11.0 + 1.12.0 2.9.2 1.9.1 20180813 diff --git a/webanno-api-dao/src/main/java/de/tudarmstadt/ukp/clarin/webanno/api/dao/ImportExportServiceImpl.java b/webanno-api-dao/src/main/java/de/tudarmstadt/ukp/clarin/webanno/api/dao/ImportExportServiceImpl.java index 5fcb9a897b9..10f1e5c5988 100644 --- a/webanno-api-dao/src/main/java/de/tudarmstadt/ukp/clarin/webanno/api/dao/ImportExportServiceImpl.java +++ b/webanno-api-dao/src/main/java/de/tudarmstadt/ukp/clarin/webanno/api/dao/ImportExportServiceImpl.java @@ -427,7 +427,7 @@ else if (!feature.getLayer().getType().equals(WebAnnoConst.CHAIN_TYPE)) { cas); ConfigurationParameterFactory.addConfigurationParameters(writer, JCasFileWriter_ImplBase.PARAM_USE_DOCUMENT_ID, true, - JCasFileWriter_ImplBase.PARAM_ESCAPE_DOCUMENT_ID, false, + JCasFileWriter_ImplBase.PARAM_ESCAPE_FILENAME, false, JCasFileWriter_ImplBase.PARAM_TARGET_LOCATION, exportTempDir, JCasFileWriter_ImplBase.PARAM_STRIP_EXTENSION, aStripExtension); diff --git a/webanno-io-conll/NOTICE.txt b/webanno-io-conll/NOTICE.txt deleted file mode 100644 index d2df625cb13..00000000000 --- a/webanno-io-conll/NOTICE.txt +++ /dev/null @@ -1,3 +0,0 @@ -This module contains CoNLL support backported from an unreleased DKPro Core version and is meant to -be dropped again and replaced by the proper DKPro Core CoNLL module when it has been released -and when WebAnno is able to depend on this version of DKPro Core. diff --git a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpFormatSupport.java b/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpFormatSupport.java index 54a404772fe..a43699aca0d 100644 --- a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpFormatSupport.java +++ b/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpFormatSupport.java @@ -24,6 +24,8 @@ import org.apache.uima.cas.CAS; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.io.conll.ConllCoreNlpReader; +import org.dkpro.core.io.conll.ConllCoreNlpWriter; import org.springframework.stereotype.Component; import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport; diff --git a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpReader.java b/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpReader.java deleted file mode 100644 index 8662c304a3d..00000000000 --- a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpReader.java +++ /dev/null @@ -1,382 +0,0 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.clarin.webanno.conll; - -import static org.apache.commons.io.IOUtils.closeQuietly; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.Type; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.factory.JCasBuilder; -import org.apache.uima.fit.util.FSUtil; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import org.dkpro.core.api.parameter.ComponentParameters; -import org.dkpro.core.api.resources.CompressionUtils; -import org.dkpro.core.api.resources.MappingProvider; -import org.dkpro.core.api.resources.MappingProviderFactory; - -import de.tudarmstadt.ukp.clarin.webanno.conll.sequencecodec.AdjacentLabelCodec; -import de.tudarmstadt.ukp.clarin.webanno.conll.sequencecodec.SequenceItem; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; - -/** - *

Reads a file in the default CoreNLP CoNLL format.

- * - * @see CoreNLP CoNLLOutputter - */ -@ResourceMetaData(name = "CoNLL CoreNLP Reader") -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) -public class ConllCoreNlpReader - extends JCasResourceCollectionReader_ImplBase -{ - /** - * Character encoding of the input data. - */ - public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, defaultValue = "UTF-8") - private String sourceEncoding; - - /** - * Read fine-grained part-of-speech information. - */ - public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; - @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") - private boolean readPos; - - /** - * Enable to use CPOS (column 4) as the part-of-speech tag. Otherwise the POS (column 3) is - * used. - */ - public static final String PARAM_USE_CPOS_AS_POS = "useCPosAsPos"; - @ConfigurationParameter(name = PARAM_USE_CPOS_AS_POS, mandatory = true, defaultValue = "false") - private boolean useCPosAsPos; - - /** - * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the - * tag set defined as part of the model meta data. This can be useful if a custom model is - * specified which does not have such meta data, or it can be used in readers. - */ - public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; - @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) - protected String posTagset; - - /** - * Load the part-of-speech tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_POS_MAPPING_LOCATION = - ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - /** - * Location of the mapping file for named entity tags to UIMA types. - */ - public static final String PARAM_NAMED_ENTITY_MAPPING_LOCATION = - ComponentParameters.PARAM_NAMED_ENTITY_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_NAMED_ENTITY_MAPPING_LOCATION, mandatory = false) - private String namedEntityMappingLocation; - - /** - * Read morphological features. - */ - public static final String PARAM_READ_NAMED_ENTITY = - ComponentParameters.PARAM_READ_NAMED_ENTITY; - @ConfigurationParameter(name = PARAM_READ_NAMED_ENTITY, mandatory = true, defaultValue = "true") - private boolean readNer; - - /** - * Read lemma information. - */ - public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; - @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") - private boolean readLemma; - - /** - * Read syntactic dependency information. - */ - public static final String PARAM_READ_DEPENDENCY = ComponentParameters.PARAM_READ_DEPENDENCY; - @ConfigurationParameter(name = PARAM_READ_DEPENDENCY, mandatory = true, defaultValue = "true") - private boolean readDependency; - - private static final String UNUSED = "_"; - - private static final int ID = 0; - private static final int FORM = 1; - private static final int LEMMA = 2; - private static final int POSTAG = 3; - private static final int NER = 4; - private static final int HEAD = 5; - private static final int DEPREL = 6; - - private MappingProvider posMappingProvider; - private MappingProvider namedEntityMappingProvider; - - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - posMappingProvider = MappingProviderFactory.createPosMappingProvider(this, - posMappingLocation, posTagset, getLanguage()); - - namedEntityMappingProvider = new MappingProvider(); - namedEntityMappingProvider.setDefault(MappingProvider.LOCATION, - "classpath:/there/is/no/mapping/yet"); - namedEntityMappingProvider.setDefault(MappingProvider.BASE_TYPE, - NamedEntity.class.getName()); - namedEntityMappingProvider.setOverride(MappingProvider.LOCATION, - namedEntityMappingLocation); - namedEntityMappingProvider.setOverride(MappingProvider.LANGUAGE, getLanguage()); - } - - @Override - public void getNext(JCas aJCas) - throws IOException, CollectionException - { - Resource res = nextFile(); - initCas(aJCas, res); - BufferedReader reader = null; - try { - reader = new BufferedReader(new InputStreamReader( - CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()), - sourceEncoding)); - convert(aJCas, reader); - } - finally { - closeQuietly(reader); - } - } - - public void convert(JCas aJCas, BufferedReader aReader) - throws IOException - { - if (readPos) { - try { - posMappingProvider.configure(aJCas.getCas()); - } - catch (AnalysisEngineProcessException e) { - throw new IOException(e); - } - } - - if (readNer) { - try { - namedEntityMappingProvider.configure(aJCas.getCas()); - } - catch (AnalysisEngineProcessException e) { - throw new IOException(e); - } - } - - JCasBuilder doc = new JCasBuilder(aJCas); - - List words; - while ((words = readSentence(aReader)) != null) { - if (words.isEmpty()) { - // Ignore empty sentences. This can happen when there are multiple end-of-sentence - // markers following each other. - continue; - } - - int sentenceBegin = doc.getPosition(); - int sentenceEnd = sentenceBegin; - - // Tokens, Lemma, POS - Map tokens = new HashMap(); - Iterator wordIterator = words.iterator(); - while (wordIterator.hasNext()) { - String[] word = wordIterator.next(); - // Read token - Token token = doc.add(word[FORM], Token.class); - tokens.put(Integer.valueOf(word[ID]), token); - if (wordIterator.hasNext()) { - doc.add(" "); - } - - // Read lemma - if (!UNUSED.equals(word[LEMMA]) && readLemma) { - Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); - lemma.setValue(word[LEMMA]); - lemma.addToIndexes(); - token.setLemma(lemma); - } - - // Read part-of-speech tag - String tag = word[POSTAG]; - if (!UNUSED.equals(tag) && readPos) { - Type posTag = posMappingProvider.getTagType(tag); - POS pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), - token.getEnd()); - pos.setPosValue(tag != null ? tag.intern() : null); - pos.addToIndexes(); - token.setPos(pos); - } - - sentenceEnd = token.getEnd(); - } - - // Read named entities - if (readNer) { - List encodedNerSpans = words.stream().map(w -> { - int id = Integer.valueOf(w[ID]); - return new SequenceItem(id, id, w[NER]); - }).collect(Collectors.toList()); - - AdjacentLabelCodec codec = new AdjacentLabelCodec(1); - List decodedNerSpans = codec.decode(encodedNerSpans); - - for (SequenceItem nerSpan : decodedNerSpans) { - Type nerType = namedEntityMappingProvider.getTagType(nerSpan.getLabel()); - Token beginToken = tokens.get(nerSpan.getBegin()); - Token endToken = tokens.get(nerSpan.getEnd()); - NamedEntity ne = (NamedEntity) aJCas.getCas().createAnnotation(nerType, - beginToken.getBegin(), endToken.getEnd()); - ne.setValue(nerSpan.getLabel()); - ne.addToIndexes(); - } - } - - // Read dependencies - if (readDependency) { - for (String[] word : words) { - if (!UNUSED.equals(word[DEPREL])) { - int depId = Integer.valueOf(word[ID]); - int govId = Integer.valueOf(word[HEAD]); - - // Model the root as a loop onto itself - if (govId == 0) { - // Not using ROOT here because WebAnno cannot deal with elevated - // types - Dependency rel = new Dependency(aJCas); - rel.setGovernor(tokens.get(depId)); - rel.setDependent(tokens.get(depId)); - rel.setDependencyType(word[DEPREL]); - rel.setBegin(rel.getDependent().getBegin()); - rel.setEnd(rel.getDependent().getEnd()); - // This is set via FSUtil because we still use the DKPro Core 1.7.0 JCas - // classes - FSUtil.setFeature(rel, "flavor", DependencyFlavor.BASIC); - rel.addToIndexes(); - } - else { - Dependency rel = new Dependency(aJCas); - rel.setGovernor(tokens.get(govId)); - rel.setDependent(tokens.get(depId)); - rel.setDependencyType(word[DEPREL]); - rel.setBegin(rel.getDependent().getBegin()); - rel.setEnd(rel.getDependent().getEnd()); - // This is set via FSUtil because we still use the DKPro Core 1.7.0 JCas - // classes - FSUtil.setFeature(rel, "flavor", DependencyFlavor.BASIC); - rel.addToIndexes(); - } - } - } - } - - // Sentence - Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd); - sentence.addToIndexes(); - - // Once sentence per line. - doc.add("\n"); - } - - doc.close(); - } - - /** - * Read a single sentence. - */ - private static List readSentence(BufferedReader aReader) - throws IOException - { - List words = new ArrayList(); - String line; - boolean firstLineOfSentence = true; - while ((line = aReader.readLine()) != null) { - if (StringUtils.isBlank(line)) { - firstLineOfSentence = true; - break; // End of sentence - } - - if (line.startsWith("<") && line.endsWith(">")) { - // FinnTreeBank uses pseudo-XML to attach extra metadata to sentences. - // Currently, we just ignore this. - break; // Consider end of sentence - } - - if (firstLineOfSentence && line.startsWith("#")) { - // GUM uses a comment to attach extra metadata to sentences. - // Currently, we just ignore this. - break; // Consider end of sentence - } - - firstLineOfSentence = false; - - String[] fields = line.split("\t"); - if (fields.length != 7) { - throw new IOException( - "Invalid file format. Line needs to have 7 tab-separated fields, but it has " - + fields.length + ": [" + line + "]"); - } - words.add(fields); - } - - if (line == null && words.isEmpty()) { - return null; - } - else { - return words; - } - } -} diff --git a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpWriter.java b/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpWriter.java deleted file mode 100644 index 72632248d8e..00000000000 --- a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpWriter.java +++ /dev/null @@ -1,234 +0,0 @@ -/* - * Copyright 2012 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.clarin.webanno.conll; - -import static org.apache.commons.io.IOUtils.closeQuietly; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.OutputStreamWriter; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.NavigableMap; -import java.util.TreeMap; -import java.util.stream.Collectors; - -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.util.FSUtil; -import org.apache.uima.jcas.JCas; -import org.dkpro.core.api.io.JCasFileWriter_ImplBase; -import org.dkpro.core.api.parameter.ComponentParameters; - -import de.tudarmstadt.ukp.clarin.webanno.conll.sequencecodec.AdjacentLabelCodec; -import de.tudarmstadt.ukp.clarin.webanno.conll.sequencecodec.SequenceItem; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; - -/** - *

Writes a file in the default CoreNLP CoNLL format.

- * - * @see CoreNLP CoNLLOutputter - */ -@ResourceMetaData(name = "CoNLL CoreNLP Reader") -@TypeCapability(inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) -public class ConllCoreNlpWriter - extends JCasFileWriter_ImplBase -{ - private static final String UNUSED = "_"; - private static final int UNUSED_INT = -1; - - /** - * Character encoding of the output data. - */ - public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, defaultValue = "UTF-8") - private String targetEncoding; - - /** - * Use this filename extension. - */ - public static final String PARAM_FILENAME_SUFFIX = "filenameSuffix"; - @ConfigurationParameter(name = PARAM_FILENAME_SUFFIX, mandatory = true, defaultValue = ".conll") - private String filenameSuffix; - - /** - * Write fine-grained part-of-speech information. - */ - public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; - @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") - private boolean writePos; - - /** - * Write named entity information. - */ - public static final String PARAM_WRITE_NAMED_ENTITY = - ComponentParameters.PARAM_WRITE_NAMED_ENTITY; - @ConfigurationParameter(name = PARAM_WRITE_NAMED_ENTITY, mandatory = true, defaultValue = "true") - private boolean writeNamedEntity; - - /** - * Write lemma information. - */ - public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; - @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") - private boolean writeLemma; - - /** - * Write syntactic dependency information. - */ - public static final String PARAM_WRITE_DEPENDENCY = ComponentParameters.PARAM_WRITE_DEPENDENCY; - @ConfigurationParameter(name = PARAM_WRITE_DEPENDENCY, mandatory = true, defaultValue = "true") - private boolean writeDependency; - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - PrintWriter out = null; - try { - out = new PrintWriter(new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), - targetEncoding)); - convert(aJCas, out); - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - finally { - closeQuietly(out); - } - } - - private void convert(JCas aJCas, PrintWriter aOut) - { - for (Sentence sentence : select(aJCas, Sentence.class)) { - Map ctokens = new LinkedHashMap<>(); - NavigableMap tokenBeginIndex = new TreeMap<>(); - NavigableMap tokenEndIndex = new TreeMap<>(); - - // Tokens - List tokens = selectCovered(Token.class, sentence); - - for (int i = 0; i < tokens.size(); i++) { - Row row = new Row(); - row.id = i + 1; - row.token = tokens.get(i); - ctokens.put(row.token, row); - tokenBeginIndex.put(row.token.getBegin(), row.token); - tokenEndIndex.put(row.token.getEnd(), row.token); - } - - // Dependencies - List basicDeps = selectCovered(Dependency.class, sentence).stream() - .filter(dep -> { - String flavor = FSUtil.getFeature(dep, "flavor", String.class); - return flavor == null || DependencyFlavor.BASIC.equals(flavor); - }) - .collect(Collectors.toList()); - for (Dependency rel : basicDeps) { - Row row = ctokens.get(rel.getDependent()); - if (row.deprel != null) { - String form = row.token.getCoveredText(); - throw new IllegalStateException("Illegal basic dependency structure - token [" - + form - + "] is dependent of more than one dependency."); - } - row.deprel = rel; - } - - // Named entities - List nerSpans = new ArrayList<>(); - for (NamedEntity ne : selectCovered(NamedEntity.class, sentence)) { - Token beginToken = tokenBeginIndex.floorEntry(ne.getBegin()).getValue(); - Token endToken = tokenEndIndex.ceilingEntry(ne.getEnd()).getValue(); - nerSpans.add(new SequenceItem(ctokens.get(beginToken).id, ctokens.get(endToken).id, - ne.getValue())); - } - AdjacentLabelCodec codec = new AdjacentLabelCodec(1); - List encodedNe = codec.encode(nerSpans, tokens.size()); - for (int i = 0; i < encodedNe.size(); i++) { - ctokens.get(tokens.get(i)).ne = encodedNe.get(i).getLabel(); - } - - // Write sentence - for (Row row : ctokens.values()) { - String form = row.token.getCoveredText(); - String lemma = UNUSED; - if (writeLemma && (row.token.getLemma() != null)) { - lemma = row.token.getLemma().getValue(); - } - - String pos = UNUSED; - if (writePos && (row.token.getPos() != null)) { - POS posAnno = row.token.getPos(); - pos = posAnno.getPosValue(); - } - - int headId = UNUSED_INT; - String deprel = UNUSED; - if (writeDependency && (row.deprel != null)) { - deprel = row.deprel.getDependencyType(); - headId = ctokens.get(row.deprel.getGovernor()).id; - if (headId == row.id) { - // ROOT dependencies may be modeled as a loop, ignore these. - headId = 0; - } - } - - String head = UNUSED; - if (headId != UNUSED_INT) { - head = Integer.toString(headId); - } - - String ner = UNUSED; - if (writeNamedEntity && (row.ne != null)) { - ner = row.ne; - } - - aOut.printf("%d\t%s\t%s\t%s\t%s\t%s\t%s\n", row.id, form, lemma, pos, ner, head, - deprel); - } - - aOut.println(); - } - } - - private static final class Row - { - int id; - Token token; - String ne; - Dependency deprel; - } -} diff --git a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUFormatSupport.java b/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUFormatSupport.java index b386fd8b961..07e8753052b 100644 --- a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUFormatSupport.java +++ b/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUFormatSupport.java @@ -24,6 +24,8 @@ import org.apache.uima.cas.CAS; import org.apache.uima.collection.CollectionReaderDescription; import org.apache.uima.resource.ResourceInitializationException; +import org.dkpro.core.io.conll.ConllUReader; +import org.dkpro.core.io.conll.ConllUWriter; import org.springframework.stereotype.Component; import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport; diff --git a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUReader.java b/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUReader.java deleted file mode 100644 index 17b659422d3..00000000000 --- a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUReader.java +++ /dev/null @@ -1,453 +0,0 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.clarin.webanno.conll; - -import static org.apache.commons.io.IOUtils.closeQuietly; -import static org.dkpro.core.api.resources.MappingProviderFactory.createPosMappingProvider; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.UimaContext; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.cas.Feature; -import org.apache.uima.cas.Type; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.fit.factory.JCasBuilder; -import org.apache.uima.jcas.JCas; -import org.apache.uima.resource.ResourceInitializationException; -import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; -import org.dkpro.core.api.parameter.ComponentParameters; -import org.dkpro.core.api.parameter.MimeTypes; -import org.dkpro.core.api.resources.CompressionUtils; -import org.dkpro.core.api.resources.MappingProvider; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT; -import it.unimi.dsi.fastutil.ints.Int2ObjectMap; -import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; - -/** - * Reads a file in the CoNLL-U format. - * - * @see CoNLL-U Format - */ -@ResourceMetaData(name = "CoNLL-U Reader") -@MimeTypeCapability({MimeTypes.TEXT_X_CONLL_U}) -@TypeCapability( - outputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) -public class ConllUReader - extends JCasResourceCollectionReader_ImplBase -{ - /** - * Character encoding of the input data. - */ - public static final String PARAM_SOURCE_ENCODING = ComponentParameters.PARAM_SOURCE_ENCODING; - @ConfigurationParameter(name = PARAM_SOURCE_ENCODING, mandatory = true, - defaultValue = ComponentParameters.DEFAULT_ENCODING) - private String sourceEncoding; - - /** - * Read fine-grained part-of-speech information. - */ - public static final String PARAM_READ_POS = ComponentParameters.PARAM_READ_POS; - @ConfigurationParameter(name = PARAM_READ_POS, mandatory = true, defaultValue = "true") - private boolean readPos; - - /** - * Read coarse-grained part-of-speech information. - */ - public static final String PARAM_READ_CPOS = ComponentParameters.PARAM_READ_CPOS; - @ConfigurationParameter(name = PARAM_READ_CPOS, mandatory = true, defaultValue = "true") - private boolean readCPos; - - /** - * Treat coarse-grained part-of-speech as fine-grained part-of-speech information. - */ - public static final String PARAM_USE_CPOS_AS_POS = "useCPosAsPos"; - @ConfigurationParameter(name = PARAM_USE_CPOS_AS_POS, mandatory = true, defaultValue = "false") - private boolean useCPosAsPos; - - /** - * Use this part-of-speech tag set to use to resolve the tag set mapping instead of using the - * tag set defined as part of the model meta data. This can be useful if a custom model is - * specified which does not have such meta data, or it can be used in readers. - */ - public static final String PARAM_POS_TAG_SET = ComponentParameters.PARAM_POS_TAG_SET; - @ConfigurationParameter(name = PARAM_POS_TAG_SET, mandatory = false) - protected String posTagset; - - /** - * Load the part-of-speech tag to UIMA type mapping from this location instead of locating - * the mapping automatically. - */ - public static final String PARAM_POS_MAPPING_LOCATION = - ComponentParameters.PARAM_POS_MAPPING_LOCATION; - @ConfigurationParameter(name = PARAM_POS_MAPPING_LOCATION, mandatory = false) - protected String posMappingLocation; - - /** - * Read morphological features. - */ - public static final String PARAM_READ_MORPH = ComponentParameters.PARAM_READ_MORPH; - @ConfigurationParameter(name = PARAM_READ_MORPH, mandatory = true, defaultValue = "true") - private boolean readMorph; - - /** - * Read lemma information. - */ - public static final String PARAM_READ_LEMMA = ComponentParameters.PARAM_READ_LEMMA; - @ConfigurationParameter(name = PARAM_READ_LEMMA, mandatory = true, defaultValue = "true") - private boolean readLemma; - - /** - * Read syntactic dependency information. - */ - public static final String PARAM_READ_DEPENDENCY = ComponentParameters.PARAM_READ_DEPENDENCY; - @ConfigurationParameter(name = PARAM_READ_DEPENDENCY, mandatory = true, defaultValue = "true") - private boolean readDependency; - - private static final String UNUSED = "_"; - - private static final int ID = 0; - private static final int FORM = 1; - private static final int LEMMA = 2; - private static final int CPOSTAG = 3; - private static final int POSTAG = 4; - private static final int FEATS = 5; - private static final int HEAD = 6; - private static final int DEPREL = 7; - private static final int DEPS = 8; - private static final int MISC = 9; - - public static final String META_SEND_ID = "sent_id"; - public static final String META_TEXT = "text"; - - private MappingProvider posMappingProvider; - - @Override - public void initialize(UimaContext aContext) - throws ResourceInitializationException - { - super.initialize(aContext); - - posMappingProvider = createPosMappingProvider(this, posMappingLocation, posTagset, - getLanguage()); - } - - @Override - public void getNext(JCas aJCas) - throws IOException, CollectionException - { - Resource res = nextFile(); - initCas(aJCas, res); - BufferedReader reader = null; - try { - reader = new BufferedReader(new InputStreamReader( - CompressionUtils.getInputStream(res.getLocation(), res.getInputStream()), - sourceEncoding)); - convert(aJCas, reader); - } - finally { - closeQuietly(reader); - } - } - - public void convert(JCas aJCas, BufferedReader aReader) - throws IOException - { - if (readPos) { - try { - posMappingProvider.configure(aJCas.getCas()); - } - catch (AnalysisEngineProcessException e) { - throw new IOException(e); - } - } - - JCasBuilder doc = new JCasBuilder(aJCas); - - while (true) { - // Read sentence comments (if any) - Map comments = readSentenceComments(aReader); - - // Read sentence - List words = readSentence(aReader); - if (words == null) { - // End of file - break; - } - - if (words.isEmpty()) { - // Ignore empty sentences. This can happen when there are multiple end-of-sentence - // markers following each other. - continue; - } - - int sentenceBegin = doc.getPosition(); - int sentenceEnd = sentenceBegin; - - int surfaceBegin = -1; - int surfaceEnd = -1; - String surfaceString = null; - - // Tokens, Lemma, POS - Int2ObjectMap tokens = new Int2ObjectOpenHashMap<>(); - Iterator wordIterator = words.iterator(); - while (wordIterator.hasNext()) { - String[] word = wordIterator.next(); - if (word[ID].contains("-")) { - String[] fragments = word[ID].split("-"); - surfaceBegin = Integer.valueOf(fragments[0]); - surfaceEnd = Integer.valueOf(fragments[1]); - surfaceString = word[FORM]; - continue; - } - - // Read token - int tokenIdx = Integer.valueOf(word[ID]); - Token token = doc.add(word[FORM], Token.class); - tokens.put(tokenIdx, token); - if (!StringUtils.contains(word[MISC], "SpaceAfter=No") && wordIterator.hasNext()) { - doc.add(" "); - } - - // Read lemma - if (!UNUSED.equals(word[LEMMA]) && readLemma) { - Lemma lemma = new Lemma(aJCas, token.getBegin(), token.getEnd()); - lemma.setValue(word[LEMMA]); - lemma.addToIndexes(); - token.setLemma(lemma); - } - - // Read part-of-speech tag - POS pos = null; - String tag = useCPosAsPos ? word[CPOSTAG] : word[POSTAG]; - if (!UNUSED.equals(tag) && readPos) { - Type posTag = posMappingProvider.getTagType(tag); - pos = (POS) aJCas.getCas().createAnnotation(posTag, token.getBegin(), - token.getEnd()); - pos.setPosValue(tag != null ? tag.intern() : null); - } - - // Read coarse part-of-speech tag - if (!UNUSED.equals(word[CPOSTAG]) && readCPos) { - if (pos == null) { - pos = new POS(aJCas, token.getBegin(), token.getEnd()); - } - pos.setCoarseValue(word[CPOSTAG].intern()); - } - - if (pos != null) { - pos.addToIndexes(); - token.setPos(pos); - } - - // Read morphological features - if (!UNUSED.equals(word[FEATS]) && readMorph) { - MorphologicalFeatures morphtag = new MorphologicalFeatures(aJCas, - token.getBegin(), token.getEnd()); - morphtag.setValue(word[FEATS]); - morphtag.addToIndexes(); - token.setMorph(morphtag); - - // Try parsing out individual feature values. Since the DKPro Core - // MorphologicalFeatures type is based on the definition from the UD project, - // we can do this rather straightforwardly. - Type morphType = morphtag.getType(); - String[] items = word[FEATS].split("\\|"); - for (String item : items) { - String[] keyValue = item.split("="); - StringBuilder key = new StringBuilder(keyValue[0]); - key.setCharAt(0, Character.toLowerCase(key.charAt(0))); - String value = keyValue[1]; - - Feature feat = morphType.getFeatureByBaseName(key.toString()); - if (feat != null) { - morphtag.setStringValue(feat, value); - } - } - } - - // Read surface form - if (tokenIdx == surfaceEnd) { - int begin = tokens.get(surfaceBegin).getBegin(); - int end = tokens.get(surfaceEnd).getEnd(); - SurfaceForm surfaceForm = new SurfaceForm(aJCas, begin, end); - surfaceForm.setValue(surfaceString); - surfaceForm.addToIndexes(); - surfaceBegin = -1; - surfaceEnd = -1; - surfaceString = null; - } - - sentenceEnd = token.getEnd(); - } - - // Dependencies - if (readDependency) { - for (String[] word : words) { - if (!UNUSED.equals(word[DEPREL])) { - int depId = Integer.valueOf(word[ID]); - int govId = Integer.valueOf(word[HEAD]); - - // Model the root as a loop onto itself - makeDependency(aJCas, govId, depId, word[DEPREL], DependencyFlavor.BASIC, - tokens, word); - } - - if (!UNUSED.equals(word[DEPS])) { - // list items separated by vertical bar - String[] items = word[DEPS].split("\\|"); - for (String item : items) { - String[] sItem = item.split(":"); - - int depId = Integer.valueOf(word[ID]); - int govId = Integer.valueOf(sItem[0]); - - makeDependency(aJCas, govId, depId, sItem[1], DependencyFlavor.ENHANCED, - tokens, word); - } - } - } - } - - // Sentence - Sentence sentence = new Sentence(aJCas, sentenceBegin, sentenceEnd); - sentence.setId(comments.get(META_SEND_ID)); - sentence.addToIndexes(); - - // Once sentence per line. - doc.add("\n"); - } - - doc.close(); - } - - private Dependency makeDependency(JCas aJCas, int govId, int depId, String label, String flavor, - Int2ObjectMap tokens, String[] word) - { - Dependency rel; - - if (govId == 0) { - rel = new ROOT(aJCas); - rel.setGovernor(tokens.get(depId)); - rel.setDependent(tokens.get(depId)); - } - else { - rel = new Dependency(aJCas); - rel.setGovernor(tokens.get(govId)); - rel.setDependent(tokens.get(depId)); - } - - rel.setDependencyType(label); - rel.setFlavor(flavor); - rel.setBegin(rel.getDependent().getBegin()); - rel.setEnd(rel.getDependent().getEnd()); - rel.addToIndexes(); - - return rel; - } - - private Map readSentenceComments(BufferedReader aReader) - throws IOException - { - Map comments = new LinkedHashMap<>(); - - while (true) { - // Check if the next line could be a header line - aReader.mark(2); - char character = (char) aReader.read(); - if ('#' == character) { - // Read the rest of the line - String line = aReader.readLine(); - if (line.contains("=")) { - String[] parts = line.split("=", 2); - comments.put(parts[0].trim(), parts[1].trim()); - } - else { - // Comment or unknown header line - } - } - else { - aReader.reset(); - break; - } - } - - return comments; - } - - /** - * Read a single sentence. - */ - private static List readSentence(BufferedReader aReader) - throws IOException - { - List words = new ArrayList<>(); - String line; - while ((line = aReader.readLine()) != null) { - if (StringUtils.isBlank(line)) { - break; // End of sentence - } - if (line.startsWith("#")) { - // Comment line - continue; - } - String[] fields = line.split("\t"); - if (fields.length != 10) { - throw new IOException( - "Invalid file format. Line needs to have 10 tab-separated fields, but it has " - + fields.length + ": [" + line + "]"); - } - words.add(fields); - } - - if (line == null && words.isEmpty()) { - return null; - } - else { - return words; - } - } -} diff --git a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUWriter.java b/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUWriter.java deleted file mode 100644 index 64797b2ef61..00000000000 --- a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUWriter.java +++ /dev/null @@ -1,302 +0,0 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.clarin.webanno.conll; - -import static org.apache.uima.fit.util.JCasUtil.indexCovered; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.apache.uima.fit.util.JCasUtil.selectCovered; - -import java.io.OutputStreamWriter; -import java.io.PrintWriter; -import java.util.ArrayList; -import java.util.Collection; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; -import org.apache.uima.analysis_engine.AnalysisEngineProcessException; -import org.apache.uima.fit.descriptor.ConfigurationParameter; -import org.apache.uima.fit.descriptor.MimeTypeCapability; -import org.apache.uima.fit.descriptor.ResourceMetaData; -import org.apache.uima.fit.descriptor.TypeCapability; -import org.apache.uima.jcas.JCas; -import org.dkpro.core.api.io.JCasFileWriter_ImplBase; -import org.dkpro.core.api.parameter.ComponentParameters; -import org.dkpro.core.api.parameter.MimeTypes; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.SurfaceForm; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency; -import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.DependencyFlavor; -import it.unimi.dsi.fastutil.ints.Int2ObjectMap; -import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap; - -/** - * Writes a file in the CoNLL-U format. - * - * @see CoNLL-U Format - */ -@ResourceMetaData(name = "CoNLL-U Writer") -@MimeTypeCapability({MimeTypes.TEXT_X_CONLL_U}) -@TypeCapability( - inputs = { - "de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures", - "de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS", - "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma", - "de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency" }) -public class ConllUWriter - extends JCasFileWriter_ImplBase -{ - private static final String UNUSED = "_"; - private static final int UNUSED_INT = -1; - - /** - * Character encoding of the output data. - */ - public static final String PARAM_TARGET_ENCODING = ComponentParameters.PARAM_TARGET_ENCODING; - @ConfigurationParameter(name = PARAM_TARGET_ENCODING, mandatory = true, - defaultValue = ComponentParameters.DEFAULT_ENCODING) - private String targetEncoding; - - /** - * Use this filename extension. - */ - public static final String PARAM_FILENAME_EXTENSION = - ComponentParameters.PARAM_FILENAME_EXTENSION; - @ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".conll") - private String filenameSuffix; - - /** - * Write fine-grained part-of-speech information. - */ - public static final String PARAM_WRITE_POS = ComponentParameters.PARAM_WRITE_POS; - @ConfigurationParameter(name = PARAM_WRITE_POS, mandatory = true, defaultValue = "true") - private boolean writePos; - - /** - * Write coarse-grained part-of-speech information. - */ - public static final String PARAM_WRITE_CPOS = ComponentParameters.PARAM_WRITE_CPOS; - @ConfigurationParameter(name = PARAM_WRITE_CPOS, mandatory = true, defaultValue = "true") - private boolean writeCPos; - - /** - * Write morphological features. - */ - public static final String PARAM_WRITE_MORPH = ComponentParameters.PARAM_WRITE_MORPH; - @ConfigurationParameter(name = PARAM_WRITE_MORPH, mandatory = true, defaultValue = "true") - private boolean writeMorph; - - /** - * Write lemma information. - */ - public static final String PARAM_WRITE_LEMMA = ComponentParameters.PARAM_WRITE_LEMMA; - @ConfigurationParameter(name = PARAM_WRITE_LEMMA, mandatory = true, defaultValue = "true") - private boolean writeLemma; - - /** - * Write syntactic dependency information. - */ - public static final String PARAM_WRITE_DEPENDENCY = ComponentParameters.PARAM_WRITE_DEPENDENCY; - @ConfigurationParameter(name = PARAM_WRITE_DEPENDENCY, mandatory = true, defaultValue = "true") - private boolean writeDependency; - - /** - * Write text covered by the token instead of the token form. - */ - public static final String PARAM_WRITE_COVERED_TEXT = - ComponentParameters.PARAM_WRITE_COVERED_TEXT; - @ConfigurationParameter(name = PARAM_WRITE_COVERED_TEXT, mandatory = true, defaultValue = "true") - private boolean writeCovered; - - /** - * Include the full sentence text as a comment in front of each sentence. - */ - public static final String PARAM_WRITE_TEXT_COMMENT = "writeTextComment"; - @ConfigurationParameter(name = PARAM_WRITE_TEXT_COMMENT, mandatory = true, defaultValue = "true") - private boolean writeTextHeader; - - @Override - public void process(JCas aJCas) - throws AnalysisEngineProcessException - { - try (PrintWriter out = new PrintWriter( - new OutputStreamWriter(getOutputStream(aJCas, filenameSuffix), targetEncoding));) { - convert(aJCas, out); - } - catch (Exception e) { - throw new AnalysisEngineProcessException(e); - } - } - - private void convert(JCas aJCas, PrintWriter aOut) - { - Map> surfaceIdx = indexCovered(aJCas, SurfaceForm.class, - Token.class); - Int2ObjectMap surfaceBeginIdx = new Int2ObjectOpenHashMap<>(); - for (SurfaceForm sf : select(aJCas, SurfaceForm.class)) { - surfaceBeginIdx.put(sf.getBegin(), sf); - } - - for (Sentence sentence : select(aJCas, Sentence.class)) { - Map ctokens = new LinkedHashMap<>(); - - // Comments - if (sentence.getId() != null) { - aOut.printf("# %s = %s\n", ConllUReader.META_SEND_ID, sentence.getId()); - } - if (writeTextHeader) { - String sentenceText = sentence.getCoveredText(); - // CoNLL-U does not support line breaks in the sentence text, so we need to replace - // such characters. - sentenceText = StringUtils.replaceChars(sentenceText, "\n\r", " "); - aOut.printf("# %s = %s\n", ConllUReader.META_TEXT, sentenceText); - } - - // Tokens - List tokens = selectCovered(Token.class, sentence); - - for (int i = 0; i < tokens.size(); i++) { - Row row = new Row(); - row.id = i + 1; - row.token = tokens.get(i); - row.noSpaceAfter = (i + 1 < tokens.size()) - && row.token.getEnd() == tokens.get(i + 1).getBegin(); - ctokens.put(row.token, row); - } - - // Dependencies - for (Dependency rel : selectCovered(Dependency.class, sentence)) { - if (StringUtils.isBlank(rel.getFlavor()) - || DependencyFlavor.BASIC.equals(rel.getFlavor())) { - ctokens.get(rel.getDependent()).deprel = rel; - } - else { - ctokens.get(rel.getDependent()).deps.add(rel); - } - } - - // Write sentence in CONLL-U format - for (Row row : ctokens.values()) { - - String form = row.token.getCoveredText(); - if (!writeCovered) { - form = row.token.getText(); - } - - String lemma = UNUSED; - if (writeLemma && (row.token.getLemma() != null)) { - lemma = row.token.getLemma().getValue(); - } - - String pos = UNUSED; - if (writePos && (row.token.getPos() != null) - && row.token.getPos().getPosValue() != null) { - POS posAnno = row.token.getPos(); - pos = posAnno.getPosValue(); - } - - String cpos = UNUSED; - if (writeCPos && (row.token.getPos() != null) - && row.token.getPos().getCoarseValue() != null) { - POS posAnno = row.token.getPos(); - cpos = posAnno.getCoarseValue(); - } - - int headId = UNUSED_INT; - String deprel = UNUSED; - String deps = UNUSED; - if (writeDependency) { - if ((row.deprel != null)) { - deprel = row.deprel.getDependencyType(); - headId = ctokens.get(row.deprel.getGovernor()).id; - if (headId == row.id) { - // ROOT dependencies may be modeled as a loop, ignore these. - headId = 0; - } - } - - StringBuilder depsBuf = new StringBuilder(); - for (Dependency d : row.deps) { - if (depsBuf.length() > 0) { - depsBuf.append('|'); - } - // Resolve self-looping root to 0-indexed root - int govId = ctokens.get(d.getGovernor()).id; - if (govId == row.id) { - govId = 0; - } - depsBuf.append(govId); - depsBuf.append(':'); - depsBuf.append(d.getDependencyType()); - } - if (depsBuf.length() > 0) { - deps = depsBuf.toString(); - } - } - - String head = UNUSED; - if (headId != UNUSED_INT) { - head = Integer.toString(headId); - } - - String feats = UNUSED; - if (writeMorph && (row.token.getMorph() != null)) { - feats = row.token.getMorph().getValue(); - } - - String misc = UNUSED; - if (row.noSpaceAfter) { - misc = "SpaceAfter=No"; - } - - SurfaceForm sf = surfaceBeginIdx.get(row.token.getBegin()); - if (sf != null) { - @SuppressWarnings({ "unchecked", "rawtypes" }) - List covered = (List) surfaceIdx.get(sf); - int id1 = ctokens.get(covered.get(0)).id; - int id2 = ctokens.get(covered.get(covered.size() - 1)).id; - aOut.printf("%d-%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", id1, id2, - sf.getValue(), UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, UNUSED, - UNUSED); - } - - aOut.printf("%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", row.id, - form, lemma, cpos, pos, feats, head, deprel, deps, - misc); - } - - aOut.println(); - } - } - - private static final class Row - { - int id; - Token token; - boolean noSpaceAfter; - Dependency deprel; - List deps = new ArrayList<>(); - } -} diff --git a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/AdjacentLabelCodec.java b/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/AdjacentLabelCodec.java deleted file mode 100644 index 6057a6448db..00000000000 --- a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/AdjacentLabelCodec.java +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright 2018 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.clarin.webanno.conll.sequencecodec; - -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Optional; - -/** - * Sequence codec which treats encodes multi-unit items without a prefix. This means that during - * decoding, there is no way to tell if two adjacent sequence items with the same label belong to - * the same unit or not - they are always treated as belonging to the same unit. - */ -public class AdjacentLabelCodec - implements SequenceCodec -{ - private String markOut = "O"; - - private final int offset; - - public AdjacentLabelCodec() - { - this(1); - } - - public AdjacentLabelCodec(int aOffset) - { - offset = aOffset; - } - - @Override - public List decode(List aEncoded) - { - List decoded = new ArrayList<>(); - - Optional starter = Optional.empty(); - Optional previous = Optional.empty(); - - Iterator i = aEncoded.iterator(); - while (i.hasNext()) { - SequenceItem current = i.next(); - - // Sequence items may not overlap - if (previous.isPresent()) { - SequenceItem prev = previous.get(); - if (current.getBegin() < prev.getEnd() || prev.getEnd() > current.getEnd()) { - throw new IllegalStateException( - "Illegal sequence item span " + current + " following " + prev); - } - } - - // Check item begin/end - if (current.getBegin() > current.getEnd()) { - throw new IllegalStateException("Illegal sequence item span: " + current); - } - - if (current.getLabel().equals(markOut)) { - if (starter.isPresent()) { - // If there is a starter, there must be a previous - assert previous.isPresent(); - - decoded.add(new SequenceItem(starter.get().getBegin(), previous.get().getEnd(), - starter.get().getLabel())); - } - - starter = Optional.empty(); - } - else if (starter.isPresent()) { - // If there is a starter, there must be a previous - assert previous.isPresent(); - - if (starter.get().getLabel().equals(current.getLabel())) { - // Nothing else to do here. We just continue the already started span. - } - else { - // Commit current span and start a new one - decoded.add(new SequenceItem(starter.get().getBegin(), previous.get().getEnd(), - starter.get().getLabel())); - starter = Optional.of(current); - } - } - else { - starter = Optional.of(current); - } - - previous = Optional.of(current); - } - - // Commit active span at the end of the sequence - if (starter.isPresent()) { - decoded.add(new SequenceItem(starter.get().getBegin(), previous.get().getEnd(), - starter.get().getLabel())); - } - - return decoded; - } - - @Override - public List encode(List aDecoded, int aLength) - { - List encoded = new ArrayList<>(); - - int idx = offset; - - Iterator i = aDecoded.iterator(); - while (i.hasNext()) { - SequenceItem current = i.next(); - - // Check overlap with already seen items - if (idx > current.getBegin()) { - throw new IllegalStateException("Illegal sequence item span: " + current); - } - - // Check item begin/end - if (current.getBegin() > current.getEnd()) { - throw new IllegalStateException("Illegal sequence item span: " + current); - } - - // Generate "outside" items - while (idx < current.getBegin()) { - encoded.add(new SequenceItem(idx, idx, markOut)); - idx++; - } - - // Generate "inside" items - while (idx <= current.getEnd()) { - encoded.add(new SequenceItem(idx, idx, current.getLabel())); - idx++; - } - } - - // Generate "outside" items until the final length is reached - while (idx < aLength + offset) { - encoded.add(new SequenceItem(idx, idx, markOut)); - idx++; - } - - return encoded; - } -} diff --git a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/SequenceCodec.java b/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/SequenceCodec.java deleted file mode 100644 index 638abebf57c..00000000000 --- a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/SequenceCodec.java +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright 2018 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.clarin.webanno.conll.sequencecodec; - -import java.util.List; - -public interface SequenceCodec -{ - List decode(List aEncoded); - - List encode(List aDecoded, int aLength); -} diff --git a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/SequenceItem.java b/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/SequenceItem.java deleted file mode 100644 index f59707ac35f..00000000000 --- a/webanno-io-conll/src/main/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/SequenceItem.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright 2018 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.clarin.webanno.conll.sequencecodec; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.commons.lang3.builder.EqualsBuilder; -import org.apache.commons.lang3.builder.HashCodeBuilder; -import org.apache.commons.lang3.builder.ToStringBuilder; -import org.apache.commons.lang3.builder.ToStringStyle; - -public class SequenceItem -{ - private final int begin; - private final int end; - private final String label; - - public SequenceItem(int aBegin, int aEnd, String aLabel) - { - super(); - begin = aBegin; - end = aEnd; - label = aLabel; - } - - public int getBegin() - { - return begin; - } - - public int getEnd() - { - return end; - } - - public String getLabel() - { - return label; - } - - @Override - public String toString() - { - return new ToStringBuilder(this, ToStringStyle.NO_CLASS_NAME_STYLE) - .append("begin", begin) - .append("end", end) - .append("label", label) - .toString(); - } - - public static List of(String... aLabels) - { - return of(1, aLabels); - } - - public static List of(int aOffset, String... aLabels) - { - int begin = 0; - List result = new ArrayList<>(aLabels.length); - for (String label : aLabels) { - result.add(new SequenceItem(begin + aOffset, begin + aOffset, label)); - begin++; - } - return result; - } - - @Override - public boolean equals(final Object other) - { - if (!(other instanceof SequenceItem)) { - return false; - } - SequenceItem castOther = (SequenceItem) other; - return new EqualsBuilder().append(begin, castOther.begin).append(end, castOther.end) - .append(label, castOther.label).isEquals(); - } - - @Override - public int hashCode() - { - return new HashCodeBuilder().append(begin).append(end).append(label).toHashCode(); - } -} diff --git a/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpReaderWriterTest.java b/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpReaderWriterTest.java deleted file mode 100644 index d1f39bddb02..00000000000 --- a/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllCoreNlpReaderWriterTest.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.clarin.webanno.conll; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; - -import org.apache.commons.io.FileUtils; -import org.apache.uima.analysis_engine.AnalysisEngineDescription; -import org.apache.uima.collection.CollectionReaderDescription; -import org.dkpro.core.testing.DkproTestContext; -import org.junit.Rule; -import org.junit.Test; - -public class ConllCoreNlpReaderWriterTest -{ - @Test - public void roundTrip() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - ConllCoreNlpReader.class, - ConllCoreNlpReader.PARAM_SOURCE_LOCATION, "src/test/resources/conll/corenlp", - ConllCoreNlpReader.PARAM_PATTERNS, "en-orig.conll"); - - AnalysisEngineDescription writer = createEngineDescription( - ConllCoreNlpWriter.class, - ConllCoreNlpWriter.PARAM_TARGET_LOCATION, "target/test-output/ConllCoreNlpReaderWriterTest-roundTrip", - ConllCoreNlpWriter.PARAM_FILENAME_SUFFIX, ".conll", - ConllCoreNlpWriter.PARAM_STRIP_EXTENSION, true, - ConllCoreNlpWriter.PARAM_OVERWRITE, true); - - runPipeline(reader, writer); - - String reference = FileUtils.readFileToString( - new File("src/test/resources/conll/corenlp/en-ref.conll"), "UTF-8") - .trim(); - String actual = FileUtils.readFileToString( - new File("target/test-output/ConllCoreNlpReaderWriterTest-roundTrip/en-orig.conll"), - "UTF-8").trim(); - - assertThat(actual).isEqualToNormalizingNewlines(reference); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUReaderTest.java b/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUReaderTest.java deleted file mode 100644 index e0c71850ec9..00000000000 --- a/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUReaderTest.java +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.clarin.webanno.conll; - -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.apache.uima.fit.util.JCasUtil.select; -import static org.dkpro.core.testing.AssertAnnotations.assertMorph; -import static org.dkpro.core.testing.AssertAnnotations.assertPOS; -import static org.dkpro.core.testing.AssertAnnotations.assertSentence; - -import org.apache.uima.collection.CollectionReaderDescription; -import org.apache.uima.fit.pipeline.JCasIterable; -import org.apache.uima.jcas.JCas; -import org.dkpro.core.testing.DkproTestContext; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.morph.MorphologicalFeatures; -import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; - -public class ConllUReaderTest -{ - @Test - public void test() - throws Exception - { - CollectionReaderDescription reader = createReaderDescription( - ConllUReader.class, - ConllUReader.PARAM_LANGUAGE, "en", - ConllUReader.PARAM_SOURCE_LOCATION, "src/test/resources/conll/u/", - ConllUReader.PARAM_PATTERNS, "conllu-en-orig.conll"); - - JCas jcas = new JCasIterable(reader).iterator().next(); - - String[] sentences = { - "They buy and sell books.", - "I have not a clue." }; - - String[] posMapped = { "POS", "POS_VERB", "POS_CONJ", "POS_VERB", "POS_NOUN", "POS_PUNCT", "POS", "POS_VERB", "POS_ADV", - "POS_DET", "POS_NOUN", "POS_PUNCT" }; - - String[] posOriginal = { "PRN", "VB", "CC", "VB", "NNS", ".", "PRN", "VB", "RB", "DT", "NN", - "." }; - - String[] morphologicalFeeatures = { - "[ 0, 4] - - Nom - - - - - Plur - - - - - - - - They (Case=Nom|Number=Plur)", - "[ 5, 8] - - - - - - - - Plur - 3 - - - Pres - - buy (Number=Plur|Person=3|Tense=Pres)", - "[ 13, 17] - - - - - - - - Plur - 3 - - - Pres - - sell (Number=Plur|Person=3|Tense=Pres)", - "[ 18, 23] - - - - - - - - Plur - - - - - - - - books (Number=Plur)", - "[ 25, 26] - - Nom - - - - - Sing - 1 - - - - - - I (Case=Nom|Number=Sing|Person=1)", - "[ 27, 31] - - - - - - - - Sing - 1 - - - Pres - - have (Number=Sing|Person=1|Tense=Pres)", - "[ 32, 35] - - - - - - - Neg - - - - - - - - - not (Negative=Neg)", - "[ 36, 37] - - - - - - - - - - - - Art - - - - a (Definite=Ind|PronType=Art)", - "[ 38, 42] - - - - - - - - Sing - - - - - - - - clue (Number=Sing)" - }; - - assertSentence(sentences, select(jcas, Sentence.class)); - assertPOS(posMapped, posOriginal, select(jcas, POS.class)); - assertMorph(morphologicalFeeatures, select(jcas, MorphologicalFeatures.class)); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUReaderWriterTest.java b/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUReaderWriterTest.java deleted file mode 100644 index be6ecf7765c..00000000000 --- a/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUReaderWriterTest.java +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Copyright 2016 - * Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.clarin.webanno.conll; - -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription; -import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription; -import static org.dkpro.core.testing.IOTestRunner.testOneWay; -import static org.dkpro.core.testing.IOTestRunner.testRoundTrip; - -import org.dkpro.core.testing.DkproTestContext; -import org.junit.Ignore; -import org.junit.Rule; -import org.junit.Test; - -public class ConllUReaderWriterTest -{ - @Test - public void roundTrip() - throws Exception - { - testRoundTrip( - createReaderDescription(ConllUReader.class), - createEngineDescription(ConllUWriter.class, - ConllUWriter.PARAM_WRITE_TEXT_COMMENT, false), - "conll/u/conllu-en-orig.conll"); - } - - @Ignore("This unfortunately doesn't work yet.") - @Test - public void roundTripV2EmptyNodes() - throws Exception - { - testRoundTrip(ConllUReader.class, ConllUWriter.class, "conll/u_v2/conllu-empty_nodes.conll"); - } - - @Test - public void roundTripV2MorphologicalAnnotation() - throws Exception - { - testRoundTrip(ConllUReader.class, ConllUWriter.class, "conll/u_v2/conllu-morphological_annotation.conll"); - } - - @Ignore("This unfortunately doesn't work yet.") - @Test - public void roundTripV2ParagraphAndDocumentBoundaries() - throws Exception - { - testRoundTrip( - createReaderDescription(ConllUReader.class), - createEngineDescription(ConllUWriter.class, - ConllUWriter.PARAM_WRITE_TEXT_COMMENT, true), - "conll/u_v2/conllu-paragraph_and_document_boundaries.conll"); - } - - @Test - public void roundTripV2SentenceBoundariesAndComments() - throws Exception - { - testRoundTrip( - createReaderDescription(ConllUReader.class), - createEngineDescription(ConllUWriter.class, - ConllUWriter.PARAM_WRITE_TEXT_COMMENT, true), - "conll/u_v2/conllu-sentence_bounaries_and_comments.conll"); - } - - @Test - public void roundTripV2SyntacticAnnotation() - throws Exception - { - testRoundTrip(ConllUReader.class, ConllUWriter.class, "conll/u_v2/conllu-syntactic_annotation.conll"); - } - - @Ignore("This unfortunately doesn't work yet.") - @Test - public void roundTripV2UntokenizedText() - throws Exception - { - testRoundTrip( - createReaderDescription(ConllUReader.class), - createEngineDescription(ConllUWriter.class, - ConllUWriter.PARAM_WRITE_TEXT_COMMENT, true), - "conll/u_v2/conllu-untokenized_text.conll"); - } - - @Test - public void roundTripV2WordsAndTokens() - throws Exception - { - testRoundTrip(ConllUReader.class, ConllUWriter.class, "conll/u_v2/conllu-words_and_tokens.conll"); - } - - @Test - public void withComments() - throws Exception - { - testOneWay( - createReaderDescription(ConllUReader.class), - createEngineDescription(ConllUWriter.class, - ConllUWriter.PARAM_WRITE_TEXT_COMMENT, false), - "conll/u/conllu-en-ref.conll", - "conll/u/conllu-en-orig2.conll"); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUWriterTest.java b/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUWriterTest.java deleted file mode 100644 index 6790358a4ec..00000000000 --- a/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/ConllUWriterTest.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright 2019 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.clarin.webanno.conll; - -import static org.apache.commons.io.FileUtils.readFileToString; -import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine; -import static org.assertj.core.api.Assertions.assertThat; - -import java.io.File; - -import org.apache.uima.analysis_engine.AnalysisEngine; -import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.jcas.JCas; -import org.dkpro.core.testing.DkproTestContext; -import org.junit.Rule; -import org.junit.Test; - -import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; -import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; - -public class ConllUWriterTest -{ - @Test - public void thatLineBreaksDoNotBreakTheFormat() throws Exception - { - String target = "target/test-output/" + testContext.getTestOutputFolderName(); - - JCas jcas = JCasFactory.createText("Test\ntest."); - new Sentence(jcas, 0, 10).addToIndexes(); - new Token(jcas, 0, 4).addToIndexes(); - new Token(jcas, 5, 9).addToIndexes(); - new Token(jcas, 9, 10).addToIndexes(); - - DocumentMetaData dmd = DocumentMetaData.create(jcas); - dmd.setDocumentId("output"); - - AnalysisEngine writer = createEngine(ConllUWriter.class, - ConllUWriter.PARAM_TARGET_LOCATION, target, - ConllUWriter.PARAM_OVERWRITE, true); - - writer.process(jcas); - - String reference = readFileToString( - new File("src/test/resources/conll/u_v2/conllu-linebreaks.conll"), "UTF-8").trim(); - String actual = readFileToString(new File(target, "output.conll"), "UTF-8").trim(); - - assertThat(actual).isEqualToNormalizingNewlines(reference); - } - - @Rule - public DkproTestContext testContext = new DkproTestContext(); -} diff --git a/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/AdjacentLabelCodecTest.java b/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/AdjacentLabelCodecTest.java deleted file mode 100644 index d288945f3d6..00000000000 --- a/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/AdjacentLabelCodecTest.java +++ /dev/null @@ -1,175 +0,0 @@ -/* - * Copyright 2018 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.clarin.webanno.conll.sequencecodec; - -import static java.util.Arrays.asList; -import static org.assertj.core.api.Assertions.assertThat; -import static org.assertj.core.api.Assertions.assertThatExceptionOfType; - -import java.util.Collection; -import java.util.List; - -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.junit.runners.Parameterized.Parameters; - -@RunWith(Parameterized.class) -public class AdjacentLabelCodecTest -{ - private AdjacentLabelCodec sut; - private int offset; - - @Parameters - public static Collection data() { - return asList(new Object[][] { { 0 }, { 1 } }); - } - - public AdjacentLabelCodecTest(int aOffset) - { - offset = aOffset; - } - - @Before - public void setup() - { - sut = new AdjacentLabelCodec(offset); - } - - @Test - public void testDecodeEmpty() - { - List encoded = SequenceItem.of(offset); - List decoded = sut.decode(encoded); - assertThat(decoded).containsExactly(); - } - - @Test - public void testDecodeSingleValidItem() - { - List encoded = SequenceItem.of(offset, "PER"); - List decoded = sut.decode(encoded); - assertThat(decoded).containsExactly(new SequenceItem(0 + offset, 0 + offset, "PER")); - } - - @Test - public void testDecodeMultiUnitSpan() - { - List encoded = SequenceItem.of(offset, "O", "PER", "PER", "O"); - List decoded = sut.decode(encoded); - assertThat(decoded).containsExactly(new SequenceItem(1 + offset, 2 + offset, "PER")); - } - - @Test - public void testDecodeTwoAdjacentUnitsWithDifferentLabels() - { - List encoded = SequenceItem.of(offset, "O", "PER", "ORG", "O"); - List decoded = sut.decode(encoded); - assertThat(decoded).containsExactly( - new SequenceItem(1 + offset, 1 + offset, "PER"), - new SequenceItem(2 + offset, 2 + offset, "ORG")); - } - - @Test - public void testDecodeEndSmallerThanBegin() - { - List encoded = asList(new SequenceItem(1 + offset, 0 + offset, "O")); - - assertThatExceptionOfType(IllegalStateException.class) - .isThrownBy(() -> sut.decode(encoded)) - .withMessageContaining("Illegal sequence item span"); - } - - @Test - public void testDecodeBadItemOrder() - { - List encoded = asList( - new SequenceItem(1 + offset, 1 + offset, "O"), - new SequenceItem(0 + offset, 0 + offset, "O")); - - assertThatExceptionOfType(IllegalStateException.class) - .isThrownBy(() -> sut.decode(encoded)) - .withMessageContaining("Illegal sequence item span"); - } - - @Test - public void testEncodeSingleUnitSingleItem() - { - List decoded = asList(new SequenceItem(0 + offset, 0 + offset, "PER")); - List encoded = sut.encode(decoded, 1); - assertThat(encoded).containsExactly(new SequenceItem(0 + offset, 0 + offset, "PER")); - } - - @Test - public void testEncodeMultipleUnitsSingleItem() - { - List decoded = asList(new SequenceItem(0 + offset, 1 + offset, "PER")); - List encoded = sut.encode(decoded, 2); - assertThat(encoded).containsExactly( - new SequenceItem(0 + offset, 0 + offset, "PER"), - new SequenceItem(1 + offset, 1 + offset, "PER")); - } - - @Test - public void testEncodeMultipleItems() - { - List decoded = asList( - new SequenceItem(0 + offset, 0 + offset, "PER"), - new SequenceItem(1 + offset, 1 + offset, "ORG")); - List encoded = sut.encode(decoded, 2); - assertThat(encoded).containsExactly( - new SequenceItem(0 + offset, 0 + offset, "PER"), - new SequenceItem(1 + offset, 1 + offset, "ORG")); - } - - @Test - public void testEncodeMultipleItemsWithGap() - { - List decoded = asList( - new SequenceItem(0 + offset, 0 + offset, "PER"), - new SequenceItem(2 + offset, 2 + offset, "ORG")); - List encoded = sut.encode(decoded, 3); - assertThat(encoded).containsExactly( - new SequenceItem(0 + offset, 0 + offset, "PER"), - new SequenceItem(1 + offset, 1 + offset, "O"), - new SequenceItem(2 + offset, 2 + offset, "ORG")); - } - - @Test - public void testEncodeBadItemSpan() - { - List encoded = asList(new SequenceItem(2 + offset, 1 + offset, "PER")); - - assertThatExceptionOfType(IllegalStateException.class) - .isThrownBy(() -> sut.encode(encoded, 2)) - .withMessageContaining("Illegal sequence item span"); - } - - @Test - public void testEncodeBadItemOrder() - { - List encoded = asList( - new SequenceItem(1 + offset, 1 + offset, "PER"), - new SequenceItem(0 + offset, 0 + offset, "ORG")); - - assertThatExceptionOfType(IllegalStateException.class) - .isThrownBy(() -> sut.encode(encoded, 2)) - .withMessageContaining("Illegal sequence item span"); - } -} diff --git a/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/SequenceItemTest.java b/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/SequenceItemTest.java deleted file mode 100644 index 69d2c643656..00000000000 --- a/webanno-io-conll/src/test/java/de/tudarmstadt/ukp/clarin/webanno/conll/sequencecodec/SequenceItemTest.java +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright 2018 - * Ubiquitous Knowledge Processing (UKP) Lab - * Technische Universität Darmstadt - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.clarin.webanno.conll.sequencecodec; - -import static org.assertj.core.api.Assertions.assertThat; - -import java.util.List; - -import org.junit.Test; - -public class SequenceItemTest -{ - @Test - public void testSpanSequenceConstructionWithDefaultOffset() - { - List sequence = SequenceItem.of("O", "B-PER", "I-PER", "O"); - - assertThat(sequence).containsExactly( - new SequenceItem(1, 1, "O"), - new SequenceItem(2, 2, "B-PER"), - new SequenceItem(3, 3, "I-PER"), - new SequenceItem(4, 4, "O")); - } - - @Test - public void testSpanSequenceConstructionWithExplicitOffset() - { - List sequence = SequenceItem.of(0, "O", "B-PER", "I-PER", "O"); - - assertThat(sequence).containsExactly( - new SequenceItem(0, 0, "O"), - new SequenceItem(1, 1, "B-PER"), - new SequenceItem(2, 2, "I-PER"), - new SequenceItem(3, 3, "O")); - } -}