Skip to content

Commit

Permalink
#4567 - Add support for a generic CAS RDF export format
Browse files Browse the repository at this point in the history
- Switch from Jena to RDF4J
  • Loading branch information
reckart committed Feb 29, 2024
1 parent 3ea91a1 commit a97c0c2
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,9 @@ private void step() throws IOException
.stripCompressionExtension(res.getLocation()))
.orElse(RDFXML);
model = Rio.parse(is, res.getLocation().toString(), format);
}
contextIterator = model.filter(null, RDF.TYPE, vf.createIRI(RdfCas.TYPE_VIEW)).iterator();
}

contextIterator = model.filter(null, RDF.TYPE, RdfCas.TYPE_VIEW).iterator();
}
else {
// No more files to read
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,6 @@ public class RdfWriter
/**
* Specify the suffix of output files. Default value <code>.ttl</code>. The file format will be
* chosen depending on the file suffix.
*
* @see RDFLanguages
*/
public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION;
@ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".ttl")
Expand All @@ -61,20 +59,20 @@ public class RdfWriter
private Set<String> iriFeatures;

private Uima2Rdf uima2rdf;

/**
 * Initializes the component via the superclass and then creates the UIMA-to-RDF converter
 * used by this writer.
 *
 * @param aContext
 *            the UIMA context handed to the superclass initialization.
 * @throws ResourceInitializationException
 *             if the superclass initialization fails.
 */
@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException
{
    super.initialize(aContext);

    // The converter is built once here from the iriFeatures field and kept in uima2rdf
    this.uima2rdf = new Uima2Rdf(this.iriFeatures);
}

@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException
{
var model = new DynamicModelFactory().createEmptyModel();

try {
uima2rdf.convert(aJCas, model);
}
Expand All @@ -83,10 +81,8 @@ public void process(JCas aJCas) throws AnalysisEngineProcessException
}

try (var docOS = getOutputStream(aJCas, filenameSuffix)) {
var format = Rio
.getParserFormatForFileName(filenameSuffix)
.orElse(RDFXML);
Rio.write(model, docOS, format);
var format = Rio.getParserFormatForFileName(filenameSuffix).orElse(RDFXML);
Rio.write(model, docOS, format);
}
catch (Exception e) {
throw new AnalysisEngineProcessException(e);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.io.rdf.internal;

import java.util.Objects;

import org.eclipse.rdf4j.model.base.AbstractIRI;

/**
 * Simple immutable IRI that stores its namespace and its local name exactly as they were
 * passed to the constructor and returns them unchanged from the accessors.
 */
public class BasicIRI extends AbstractIRI
{
    private static final long serialVersionUID = 4794310809421877727L;

    private final String namespace;
    private final String localName;

    /**
     * Creates an IRI from its two parts.
     *
     * @param aNamespace
     *            the namespace part of the IRI; must not be {@code null}.
     * @param aLocalName
     *            the local name part of the IRI; must not be {@code null}.
     * @throws NullPointerException
     *             if {@code aNamespace} or {@code aLocalName} is {@code null}.
     */
    public BasicIRI(String aNamespace, String aLocalName)
    {
        // Fail fast: a null part would otherwise only surface later, when the IRI is used
        namespace = Objects.requireNonNull(aNamespace, "aNamespace must not be null");
        localName = Objects.requireNonNull(aLocalName, "aLocalName must not be null");
    }

    /**
     * @return the namespace given at construction time.
     */
    @Override
    public String getNamespace()
    {
        return namespace;
    }

    /**
     * @return the local name given at construction time.
     */
    @Override
    public String getLocalName()
    {
        return localName;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,19 @@
*/
package de.tudarmstadt.ukp.inception.io.rdf.internal;

import static de.tudarmstadt.ukp.inception.io.rdf.internal.RdfCas.NS_RDFCAS;
import static de.tudarmstadt.ukp.inception.io.rdf.internal.RdfCas.SCHEME_UIMA;
import static de.tudarmstadt.ukp.inception.io.rdf.internal.RdfCas.PROP_INDEXED_IN;
import static de.tudarmstadt.ukp.inception.io.rdf.internal.RdfCas.PROP_SOFA_ID;
import static de.tudarmstadt.ukp.inception.io.rdf.internal.RdfCas.PROP_SOFA_MIME_TYPE;
import static de.tudarmstadt.ukp.inception.io.rdf.internal.RdfCas.PROP_SOFA_STRING;
import static de.tudarmstadt.ukp.inception.io.rdf.internal.RdfCas.TYPE_FEATURE_STRUCTURE;
import static de.tudarmstadt.ukp.inception.io.rdf.internal.RdfCas.TYPE_VIEW;
import static org.apache.commons.lang3.StringUtils.substringAfterLast;

import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FeatureStructure;
Expand All @@ -31,7 +40,6 @@
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.vocabulary.RDF;

import de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData;
Expand All @@ -40,27 +48,21 @@ public class Rdf2Uima
{
public static void convert(Model aModel, Statement aContext, JCas aJCas) throws CASException
{
var vf = SimpleValueFactory.getInstance();
var m = aModel;

// Set up names
var tView = vf.createIRI(RdfCas.TYPE_VIEW);
var tFeatureStructure = vf.createIRI(RdfCas.TYPE_FEATURE_STRUCTURE);
var pIndexedIn = vf.createIRI(RdfCas.PROP_INDEXED_IN);

var fsIndex = new HashMap<Resource, FeatureStructure>();

// Convert the views/SofAs
var viewIndex = new HashMap<Resource, JCas>();
for (var view : aModel.filter(null, RDF.TYPE, tView).subjects()) {
for (var view : aModel.filter(null, RDF.TYPE, TYPE_VIEW).subjects()) {
var viewJCas = convertView(aModel, view, aJCas);
viewIndex.put(view, viewJCas);
fsIndex.put(view, viewJCas.getSofa());
}

// Convert the FSes but without setting their feature values yet - we cannot fill
// the feature values just yet because some of them may point to FSes not yet created
var fses = m.filter(null, RDF.TYPE, tFeatureStructure).subjects();
var fses = m.filter(null, RDF.TYPE, TYPE_FEATURE_STRUCTURE).subjects().toArray(Resource[]::new);
for (var fs : fses) {
var uimaFS = initFS(aModel, fs, aJCas);
fsIndex.put(fs, uimaFS);
Expand All @@ -73,7 +75,7 @@ public static void convert(Model aModel, Statement aContext, JCas aJCas) throws

// Finally add the FSes to the indexes of the respective views
for (var fs : fses) {
for (var indexedIn : aModel.filter(fs, pIndexedIn, null).objects()) {
for (var indexedIn : aModel.filter(fs, PROP_INDEXED_IN, null).objects()) {
var viewJCas = viewIndex.get(indexedIn);
viewJCas.addFsToIndexes(fsIndex.get(fs));
}
Expand All @@ -82,17 +84,10 @@ public static void convert(Model aModel, Statement aContext, JCas aJCas) throws

public static JCas convertView(Model aModel, Resource aView, JCas aJCas) throws CASException
{
var vf = SimpleValueFactory.getInstance();

// Set up names
var pSofaID = vf.createIRI(RdfCas.PROP_SOFA_ID);
var pSofaString = vf.createIRI(RdfCas.PROP_SOFA_STRING);
var pSofaMimeType = vf.createIRI(RdfCas.PROP_SOFA_MIME_TYPE);

// Get the values
var viewName = aModel.filter(aView, pSofaID, null).objects().iterator().next().stringValue();
var sofaString = aModel.filter(aView, pSofaString, null).objects().iterator().next().stringValue();
var sofaMimeType = aModel.filter(aView, pSofaMimeType, null).objects().iterator().next().stringValue();
var viewName = aModel.filter(aView, PROP_SOFA_ID, null).objects().iterator().next().stringValue();
var sofaString = aModel.filter(aView, PROP_SOFA_STRING, null).objects().iterator().next().stringValue();
var sofaMimeType = aModel.filter(aView, PROP_SOFA_MIME_TYPE, null).objects().iterator().next().stringValue();

// Instantiate the view/SofA
var view = JCasUtil.getView(aJCas, viewName, true);
Expand All @@ -107,10 +102,10 @@ public static FeatureStructure initFS(Model aModel, Resource aFS, JCas aJCas)

// Figure out the UIMA type - there can be only one type per FS
var types = aModel.filter(aFS, RDF.TYPE, null).objects();
types.removeIf(res -> res.stringValue().startsWith(RdfCas.NS_RDFCAS));
types.removeIf(res -> res.stringValue().startsWith(NS_RDFCAS));
assert types.size() == 1;
var type = CasUtil.getType(cas,
types.iterator().next().stringValue().substring(RdfCas.NS_UIMA.length()));
types.iterator().next().stringValue().substring(SCHEME_UIMA.length()));

FeatureStructure fs;
if (type.getName().equals(DocumentMetaData.class.getName())) {
Expand All @@ -135,7 +130,7 @@ public static FeatureStructure convertFS(Model aModel, Resource aFS, JCas aJCas,
continue;
}

var featureName = StringUtils.substringAfterLast(stmt.getPredicate().stringValue(), "-");
var featureName = substringAfterLast(stmt.getPredicate().stringValue(), "-");
var uimaFeat = fs.getType().getFeatureByBaseName(featureName);

// Cannot update start/end of document annotation because that FS is already indexed, so
Expand All @@ -149,7 +144,7 @@ public static FeatureStructure convertFS(Model aModel, Resource aFS, JCas aJCas,
if (uimaFeat.getRange().isPrimitive()) {
Literal literal = null;
if (stmt.getObject().isLiteral()) {
literal = (Literal) stmt;
literal = (Literal) stmt.getObject();
}

switch (uimaFeat.getRange().getName()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
package de.tudarmstadt.ukp.inception.io.rdf.internal;

import org.apache.uima.cas.CAS;
import org.eclipse.rdf4j.model.IRI;

/**
* RDF CAS vocabulary.
Expand All @@ -27,19 +28,18 @@ public class RdfCas
public static final String PREFIX_RDFCAS = "rdfcas";

public static final String NS_RDFCAS = "http://uima.apache.org/rdf/cas#";
public static final String NS_UIMA = "uima:";
public static final String SCHEME_UIMA = "uima:";

public static final String PROP_VIEW = NS_RDFCAS + "view";
public static final String PROP_INDEXED_IN = NS_RDFCAS + "indexedIn";
public static final IRI PROP_VIEW = new BasicIRI(NS_RDFCAS, "view");
public static final IRI PROP_INDEXED_IN =new BasicIRI(NS_RDFCAS, "indexedIn");

// public static final String TYPE_CAS = NS_RDFCAS + "CAS";
public static final String TYPE_VIEW = NS_RDFCAS + "View";
public static final String TYPE_FEATURE_STRUCTURE = NS_RDFCAS + "FeatureStructure";
public static final IRI TYPE_VIEW = new BasicIRI(NS_RDFCAS, "View");
public static final IRI TYPE_FEATURE_STRUCTURE = new BasicIRI(NS_RDFCAS, "FeatureStructure");

public static final String PROP_SOFA_ID = NS_UIMA + CAS.TYPE_NAME_SOFA + '-'
+ CAS.FEATURE_BASE_NAME_SOFAID;
public static final String PROP_SOFA_STRING = NS_UIMA + CAS.TYPE_NAME_SOFA + '-'
+ CAS.FEATURE_BASE_NAME_SOFASTRING;
public static final String PROP_SOFA_MIME_TYPE = NS_UIMA + CAS.TYPE_NAME_SOFA + '-'
+ CAS.FEATURE_BASE_NAME_SOFAMIME;
public static final IRI PROP_SOFA_ID = new BasicIRI(SCHEME_UIMA, CAS.TYPE_NAME_SOFA + '-'
+ CAS.FEATURE_BASE_NAME_SOFAID);
public static final IRI PROP_SOFA_STRING = new BasicIRI(SCHEME_UIMA, CAS.TYPE_NAME_SOFA + '-'
+ CAS.FEATURE_BASE_NAME_SOFASTRING);
public static final IRI PROP_SOFA_MIME_TYPE = new BasicIRI(SCHEME_UIMA, CAS.TYPE_NAME_SOFA + '-'
+ CAS.FEATURE_BASE_NAME_SOFAMIME);
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
*/
package de.tudarmstadt.ukp.inception.io.rdf.internal;

import static de.tudarmstadt.ukp.inception.io.rdf.internal.RdfCas.PREFIX_RDFCAS;
import static de.tudarmstadt.ukp.inception.io.rdf.internal.RdfCas.SCHEME_UIMA;
import static java.lang.String.format;

import java.util.HashSet;
Expand All @@ -31,6 +33,7 @@
import org.apache.uima.jcas.JCas;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Model;
import org.eclipse.rdf4j.model.Namespace;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import de.tudarmstadt.ukp.clarin.webanno.diag.CasDoctorUtils;
Expand All @@ -56,9 +59,9 @@ public void convert(JCas aJCas, Model aTarget) throws CASException
{
// Set up prefix mappings
var ts = aJCas.getTypeSystem();
aTarget.setNamespace("cas", RdfCas.NS_UIMA + "uima.cas.");
aTarget.setNamespace("tcas", RdfCas.NS_UIMA + "uima.tcas.");
aTarget.setNamespace(RdfCas.PREFIX_RDFCAS, RdfCas.NS_RDFCAS);
aTarget.setNamespace("cas", SCHEME_UIMA + "uima.cas.");
aTarget.setNamespace("tcas", SCHEME_UIMA + "uima.tcas.");
aTarget.setNamespace(PREFIX_RDFCAS, RdfCas.NS_RDFCAS);

// Additional prefix mappings for DKPro Core typesystems
for (var t : ts.getProperlySubsumedTypes(ts.getTopType())) {
Expand All @@ -73,7 +76,7 @@ public void convert(JCas aJCas, Model aTarget) throws CASException
if (nameMatcher.group("INMODULE") != null) {
prefix = prefix + "-" + nameMatcher.group("INMODULE");
}
aTarget.setNamespace(prefix, RdfCas.NS_UIMA + nameMatcher.group("LONG"));
aTarget.setNamespace(prefix, SCHEME_UIMA + nameMatcher.group("LONG"));
}
}

Expand All @@ -87,11 +90,6 @@ private void convertView(JCas aJCas, Model aTarget)
{
var vf = SimpleValueFactory.getInstance();

// Set up names
var tView = vf.createIRI(RdfCas.TYPE_VIEW);
var tFeatureStructure = vf.createIRI(RdfCas.TYPE_FEATURE_STRUCTURE);
var pIndexedIn = vf.createIRI(RdfCas.PROP_INDEXED_IN);

// Get a URI for the document
var dmd = DocumentMetaData.get(aJCas);
var docuri = dmd.getDocumentUri() != null ? dmd.getDocumentUri()
Expand All @@ -104,23 +102,23 @@ private void convertView(JCas aJCas, Model aTarget)
reachable.add(aJCas.getSofa());

// Set up the view itself
var viewUri = format("%s#%d", docuri, aJCas.getLowLevelCas().ll_getFSRef(aJCas.getSofa()));
var rdfView = vf.createIRI(viewUri);
aTarget.add(rdfView, RDF.TYPE, tView);
var rdfView = vf.createIRI(
format("%s#%d", docuri, aJCas.getLowLevelCas().ll_getFSRef(aJCas.getSofa())));
aTarget.add(rdfView, RDF.TYPE, RdfCas.TYPE_VIEW);

for (var uimaFS : reachable) {
var uri = format("%s#%d", docuri, aJCas.getLowLevelCas().ll_getFSRef(uimaFS));
var rdfFS = vf.createIRI(uri);
aTarget.add(rdfFS, RDF.TYPE, vf.createIRI(rdfType(uimaFS.getType())));
aTarget.add(rdfFS, RDF.TYPE, rdfType(aTarget, uimaFS.getType()));

// The SoFa is not a regular FS - do not mark it as such
if (uimaFS != aJCas.getSofa()) {
aTarget.add(rdfFS, RDF.TYPE, tFeatureStructure);
aTarget.add(rdfFS, RDF.TYPE, RdfCas.TYPE_FEATURE_STRUCTURE);
}

// Internal UIMA information
if (indexed.contains(uimaFS)) {
aTarget.add(rdfFS, pIndexedIn, rdfView);
aTarget.add(rdfFS, RdfCas.PROP_INDEXED_IN, rdfView);
}

// Convert features
Expand All @@ -133,7 +131,7 @@ private void convertFeatures(Model aTarget, String docuri, FeatureStructure uima
var vf = SimpleValueFactory.getInstance();

for (var uimaFeat : uimaFS.getType().getFeatures()) {
var rdfFeat = vf.createIRI(rdfFeature(uimaFeat));
var rdfFeat = rdfFeature(aTarget, uimaFeat);
if (uimaFeat.getRange().isPrimitive()) {
switch (uimaFeat.getRange().getName()) {
case CAS.TYPE_NAME_BOOLEAN:
Expand Down Expand Up @@ -189,13 +187,32 @@ private static String rdfUri(String docuri, FeatureStructure uimaFS)
return format("%s#%d", docuri, uimaFS.getCAS().getLowLevelCAS().ll_getFSRef(uimaFS));
}

private static String rdfFeature(Feature aUimaFeature)
/**
 * Builds the IRI for a UIMA feature. The IRI shares the namespace of the IRI of the
 * feature's domain type; its local name is the local name of that type IRI followed by
 * {@code -} and the short name of the feature.
 *
 * @param aModel
 *            the model passed on to {@code rdfType} when resolving the domain type IRI.
 * @param aUimaFeature
 *            the UIMA feature to build the IRI for.
 * @return the IRI of the feature.
 */
private static IRI rdfFeature(Model aModel, Feature aUimaFeature)
{
    var domainIri = rdfType(aModel, aUimaFeature.getDomain());
    var featureLocalName = domainIri.getLocalName() + "-" + aUimaFeature.getShortName();
    return new BasicIRI(domainIri.getNamespace(), featureLocalName);
}

private static String rdfType(Type aUimaType)
/**
 * Builds the IRI for a UIMA type. Among the namespaces registered on the model, the longest
 * one that starts with the {@code uima:} scheme and whose remainder is a prefix of the type
 * name is used as the namespace of the IRI; the rest of the type name becomes the local
 * name. If no registered namespace matches, the IRI is created from the scheme followed by
 * the full type name.
 *
 * @param aModel
 *            the model whose registered namespaces are searched.
 * @param aUimaType
 *            the UIMA type to build the IRI for.
 * @return the IRI of the type.
 */
private static IRI rdfType(Model aModel, Type aUimaType)
{
    var typeName = aUimaType.getName();

    Namespace bestNs = null;
    for (var ns : aModel.getNamespaces()) {
        var nsName = ns.getName();

        // Namespaces outside the UIMA scheme (e.g. the rdfcas vocabulary) can never hold a
        // UIMA type; skipping them also keeps the substring below within bounds
        if (!nsName.startsWith(SCHEME_UIMA)) {
            continue;
        }

        if (!typeName.startsWith(nsName.substring(SCHEME_UIMA.length()))) {
            continue;
        }

        // Longest match wins - compare the full namespace names on both sides so that the
        // scheme prefix does not skew the comparison
        if (bestNs == null || nsName.length() > bestNs.getName().length()) {
            bestNs = ns;
        }
    }

    if (bestNs != null) {
        var namespace = bestNs.getName();
        // The namespace includes the scheme while the type name does not
        var localName = typeName.substring(namespace.length() - SCHEME_UIMA.length());
        return new BasicIRI(namespace, localName);
    }

    return SimpleValueFactory.getInstance().createIRI(SCHEME_UIMA + typeName);
}
}
Loading

0 comments on commit a97c0c2

Please sign in to comment.