Skip to content

Commit

Permalink
#4533 - BioC exported from INCEpTION cannot be imported again due to …
Browse files Browse the repository at this point in the history
…missing mandatory metadata

- Make sure that key, date and source are generated if they are not present in the CAS when writing BioC
- Make reading BioC more robust so that it does not fail if key, date or source are missing
  • Loading branch information
reckart committed Feb 21, 2024
1 parent b764d7e commit dd4f354
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@
package de.tudarmstadt.ukp.inception.io.bioc;

import static de.tudarmstadt.ukp.inception.support.xml.XmlParserUtils.isStartElement;
import static java.util.Arrays.asList;

import java.io.IOException;
import java.io.InputStream;
import java.util.Optional;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
Expand Down Expand Up @@ -147,24 +149,40 @@ protected void readCollectionMetdata() throws XMLStreamException
while ((collectionSource == null || collectionDate == null || collectionKey == null)
&& xmlEventReader.hasNext()) {
var event = xmlEventReader.nextEvent();
if (event.isStartElement()) {
if (event.asStartElement().getName().getLocalPart().equals(E_KEY)) {
event = xmlEventReader.nextEvent();
collectionKey = event.asCharacters().getData();
xmlEventReader.nextEvent(); // Reader closing element
}
else if (event.asStartElement().getName().getLocalPart().equals(E_SOURCE)) {
event = xmlEventReader.nextEvent();
collectionSource = event.asCharacters().getData();
xmlEventReader.nextEvent(); // Reader closing element
}
else if (event.asStartElement().getName().getLocalPart().equals(E_DATE)) {
event = xmlEventReader.nextEvent();
collectionDate = event.asCharacters().getData();
xmlEventReader.nextEvent(); // Reader closing element

tryReadingMetadata(event, E_SOURCE).ifPresent($ -> collectionSource = $);
tryReadingMetadata(event, E_DATE).ifPresent($ -> collectionDate = $);
tryReadingMetadata(event, E_KEY).ifPresent($ -> collectionKey = $);

if (xmlEventReader.hasNext()) {
var nextEvent = xmlEventReader.peek();
if (nextEvent.isStartElement() && asList(E_DOCUMENT, E_INFON)
.contains(nextEvent.asStartElement().getName().getLocalPart())) {
// Make sure we do not consume the documents while looking for collection
// metadata. Although all metadata fields are mandatory in BioC, some of
// them may be missing anyway...
break;
}
}
}
}
}

/**
 * Tries to read the text content of a metadata element.
 * <p>
 * If {@code event} is a start element whose local name equals {@code element}, the next event
 * is pulled from the reader. When that event is character data, its text is returned. When it
 * is neither character data nor an end element, one additional event is pulled from the reader
 * and discarded. In every case other than character data, the result is empty.
 *
 * @param event
 *            the event that was most recently read from the reader.
 * @param element
 *            the local name of the metadata element to look for.
 * @return the character data directly following the start element, or an empty optional.
 * @throws XMLStreamException
 *             if fetching the next event from the reader fails.
 */
private Optional<String> tryReadingMetadata(XMLEvent event, String element)
    throws XMLStreamException
{
    if (!event.isStartElement()) {
        return Optional.empty();
    }

    var localName = event.asStartElement().getName().getLocalPart();
    if (!localName.equals(element)) {
        return Optional.empty();
    }

    var following = xmlEventReader.nextEvent();
    if (following.isCharacters()) {
        return Optional.of(following.asCharacters().getData());
    }

    if (!following.isEndElement()) {
        // Neither text nor the end tag came directly after the start tag, so one more event
        // is skipped here (presumably meant to be the closing element).
        xmlEventReader.next();
    }

    return Optional.empty();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@

import static de.tudarmstadt.ukp.inception.io.bioc.BioCComponent.getCollectionMetadataField;

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

import javax.xml.bind.JAXBContext;
import javax.xml.bind.JAXBException;
import javax.xml.bind.Marshaller;
Expand Down Expand Up @@ -80,8 +83,10 @@ public void initialize(UimaContext aContext) throws ResourceInitializationExcept
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException
{
var formatter = DateTimeFormatter.ofPattern("yyyyMMdd");

try (var docOS = getOutputStream(aJCas, filenameSuffix)) {
Marshaller marshaller = context.createMarshaller();
var marshaller = context.createMarshaller();
marshaller.setProperty(Marshaller.JAXB_FORMATTED_OUTPUT, true);
// Set to fragment mode to omit XML declaration
marshaller.setProperty(Marshaller.JAXB_FRAGMENT, true);
Expand All @@ -91,6 +96,8 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException
// Base-information - may be overwritten by the metadata fields below
var dmd = DocumentMetaData.get(aJCas);
bioCCollection.setSource(dmd.getCollectionId());
bioCCollection.setKey(dmd.getDocumentId());
bioCCollection.setDate(LocalDate.now().format(formatter));

// Use BioC metadata fields if available
getCollectionMetadataField(aJCas.getCas(), E_SOURCE)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,4 +115,19 @@ void testReadMultipleFromOneFile() throws Exception
assertThat(texts) //
.containsExactly("Document 1 text.", "Document 2 text.", "Document 3 text.");
}

/**
 * Reads a BioC file whose collection only carries an empty {@code key} element and checks
 * that the texts of all three documents are still returned in order.
 */
@Test
void testReadFileWithIncompleteMetadata() throws Exception
{
    var readerDescription = createReaderDescription( //
            BioCReader.class, //
            BioCReader.PARAM_SOURCE_LOCATION,
            "src/test/resources/bioc/example-with-incomplete-metadata.xml");

    var documentTexts = new ArrayList<String>();
    for (var cas : iteratePipeline(readerDescription)) {
        documentTexts.add(cas.getDocumentText().trim());
    }

    assertThat(documentTexts) //
            .containsExactly("Document 1 text.", "Document 2 text.", "Document 3 text.");
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.io.bioc;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine;
import static org.assertj.core.api.Assertions.assertThat;

import java.io.File;

import org.apache.uima.fit.factory.CasFactory;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;

/**
 * Tests for {@link BioCWriter}.
 */
class BioCWriterTest
{
    /**
     * Writes a CAS that only carries {@link DocumentMetaData} (collection ID and document ID)
     * and checks that the written file contains {@code source}, {@code date} and {@code key}
     * elements.
     *
     * @param aTmp
     *            temporary folder used as the writer's target location.
     */
    @Test
    void thatMetadataIsGeneratedToOutput(@TempDir File aTmp) throws Exception
    {
        var cas = CasFactory.createCas();
        cas.setDocumentText("This is a test");

        var metadata = DocumentMetaData.create(cas);
        metadata.setDocumentId("documentId");
        metadata.setCollectionId("collectionId");

        var engine = createEngine( //
                BioCWriter.class, //
                BioCWriter.PARAM_TARGET_LOCATION, aTmp);
        engine.process(cas);

        // The file name is derived from the document ID set above
        var outputFile = new File(aTmp, "documentId.xml");
        assertThat(outputFile).exists();
        assertThat(outputFile).content() //
                .contains("<source>collectionId</source>") //
                .contains("<date>") //
                .contains("<key>documentId</key>");
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<collection>
<key/>
<document>
<id>1</id>
<passage>
<offset>0</offset>
<text>Document 1 text.</text>
</passage>
</document>
<document>
<id>2</id>
<passage>
<offset>0</offset>
<text>Document 2 text.</text>
</passage>
</document>
<document>
<id>3</id>
<passage>
<offset>0</offset>
<text>Document 3 text.</text>
</passage>
</document>
</collection>

0 comments on commit dd4f354

Please sign in to comment.