Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#1565 - Option to replace illegal characters in XMI files #1566

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions dkpro-core-build/src/main/resources/dkpro-core/checkstyle.xml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
</module>

<module name="TreeWalker">
<module name="SuppressionCommentFilter"/>

<module name="OuterTypeFilename"/>
<module name="IllegalTokenText">
<property name="tokens" value="STRING_LITERAL, CHAR_LITERAL"/>
Expand Down
4 changes: 0 additions & 4 deletions dkpro-core-io-xmi-asl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,6 @@
<groupId>org.apache.uima</groupId>
<artifactId>uimafit-core</artifactId>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
</dependency>
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-io-asl</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
package org.dkpro.core.io.xmi;

import static java.util.Arrays.asList;
import static org.apache.commons.io.IOUtils.closeQuietly;

import java.io.File;
import java.io.IOException;
Expand All @@ -42,6 +41,7 @@
import org.dkpro.core.api.parameter.ComponentParameters;
import org.dkpro.core.api.parameter.MimeTypes;
import org.dkpro.core.api.resources.CompressionUtils;
import org.dkpro.core.io.xmi.internal.IllegalXmlCharacterSanitizingContentHandler;
import org.xml.sax.SAXException;

import eu.openminted.share.annotations.api.DocumentationResource;
Expand All @@ -51,28 +51,26 @@
*/
@ResourceMetaData(name = "UIMA XMI CAS Writer")
@DocumentationResource("${docbase}/format-reference.html#format-${command}")
@MimeTypeCapability({MimeTypes.APPLICATION_VND_XMI_XML, MimeTypes.APPLICATION_X_UIMA_XMI})
@TypeCapability(
inputs = {
"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData"})
@MimeTypeCapability({ MimeTypes.APPLICATION_VND_XMI_XML, MimeTypes.APPLICATION_X_UIMA_XMI })
@TypeCapability( //
inputs = { //
"de.tudarmstadt.ukp.dkpro.core.api.metadata.type.DocumentMetaData" })
public class XmiWriter
extends JCasFileWriter_ImplBase
{
/**
* Format and indent the XML.
*/
public static final String PARAM_PRETTY_PRINT = "prettyPrint";
@ConfigurationParameter(name = PARAM_PRETTY_PRINT, mandatory = true, defaultValue = "true")
@ConfigurationParameter(name = PARAM_PRETTY_PRINT, defaultValue = "true")
private boolean prettyPrint;

/**
* Location to write the type system to. If this is not set, a file called typesystem.xml will
* be written to the XMI output path. If this is set, it is expected to be a file relative
* to the current work directory or an absolute file.
* <br>
* If this parameter is set, the {@link #PARAM_COMPRESSION} parameter has no effect on the
* type system. Instead, if the file name ends in ".gz", the file will be compressed,
* otherwise not.
* be written to the XMI output path. If this is set, it is expected to be a file relative to
* the current work directory or an absolute file. <br>
* If this parameter is set, the {@link #PARAM_COMPRESSION} parameter has no effect on the type
* system. Instead, if the file name ends in ".gz", the file will be compressed, otherwise not.
*/
public static final String PARAM_TYPE_SYSTEM_FILE = "typeSystemFile";
@ConfigurationParameter(name = PARAM_TYPE_SYSTEM_FILE, mandatory = false)
Expand All @@ -82,47 +80,60 @@ public class XmiWriter
* Specify the suffix of output files. Default value <code>.xmi</code>. If the suffix is not
* needed, provide an empty string as value.
*/
public static final String PARAM_FILENAME_EXTENSION =
ComponentParameters.PARAM_FILENAME_EXTENSION;
@ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, mandatory = true, defaultValue = ".xmi")
public static final String PARAM_FILENAME_EXTENSION = ComponentParameters.PARAM_FILENAME_EXTENSION;
@ConfigurationParameter(name = PARAM_FILENAME_EXTENSION, defaultValue = ".xmi")
private String filenameSuffix;

/**
* Defines the XML version used for serializing the data. The default is XML {@code "1.0"}.
* However, XML 1.0 does not support certain Unicode characters. To support a wider range of
* Defines the XML version used for serializing the data. The default is XML {@code "1.0"}.
* However, XML 1.0 does not support certain Unicode characters. To support a wider range of
* characters, you can switch this parameter to {@code "1.1"}.
*/
public static final String PARAM_VERSION = "version";
@ConfigurationParameter(name = PARAM_VERSION, mandatory = true, defaultValue = "1.0")
@ConfigurationParameter(name = PARAM_VERSION, defaultValue = "1.0")
private String version;



/**
* Sanitize illegal characters in the output by replacing them with a space character so offsets do not change.
*/
public static final String PARAM_SANITIZE_ILLEGAL_CHARACTERS = "sanitizeIllegalCharacters";
@ConfigurationParameter(name = PARAM_SANITIZE_ILLEGAL_CHARACTERS, defaultValue = "false")
private boolean sanitizeIllegalCharacters;

private boolean typeSystemWritten;
private XMLSerializer sax2xml;
private XmiCasSerializer xmiCasSerializer;

@Override
public void initialize(UimaContext aContext)
throws ResourceInitializationException
public void initialize(UimaContext aContext) throws ResourceInitializationException
{
super.initialize(aContext);

if (!asList("1.0", "1.1").contains(version)) {
throw new ResourceInitializationException(new IllegalArgumentException(
"Invalid value for parameter version: [" + version + "]"));
}

typeSystemWritten = false;

xmiCasSerializer = new XmiCasSerializer(null);
sax2xml = new XMLSerializer(prettyPrint);
sax2xml.setOutputProperty(OutputKeys.VERSION, version);
}

@Override
public void process(JCas aJCas)
throws AnalysisEngineProcessException
public void process(JCas aJCas) throws AnalysisEngineProcessException
{
try (OutputStream docOS = getOutputStream(aJCas, filenameSuffix)) {
XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null);
XMLSerializer sax2xml = new XMLSerializer(docOS, prettyPrint);
sax2xml.setOutputProperty(OutputKeys.VERSION, version);
xmiCasSerializer.serialize(aJCas.getCas(), sax2xml.getContentHandler(), null, null,
null);
try (var docOS = getOutputStream(aJCas, filenameSuffix)) {
sax2xml.setOutputStream(docOS);

var contentHandler = sax2xml.getContentHandler();
if (sanitizeIllegalCharacters) {
contentHandler = new IllegalXmlCharacterSanitizingContentHandler(contentHandler,
version);
}

xmiCasSerializer.serialize(aJCas.getCas(), contentHandler, null, null, null);

if (!typeSystemWritten) {
writeTypeSystem(aJCas);
Expand All @@ -134,24 +145,19 @@ public void process(JCas aJCas)
}
}

private void writeTypeSystem(JCas aJCas)
throws IOException, CASRuntimeException, SAXException
private void writeTypeSystem(JCas aJCas) throws IOException, CASRuntimeException, SAXException
{
@SuppressWarnings("resource")
OutputStream typeOS = null;

try {
if (typeSystemFile != null) {
typeOS = CompressionUtils.getOutputStream(typeSystemFile);
}
else {
typeOS = getOutputStream("TypeSystem", ".xml");
}

try (var typeOS = getTypeSystemOutputStream()) {
TypeSystemUtil.typeSystem2TypeSystemDescription(aJCas.getTypeSystem()).toXML(typeOS);
}
finally {
closeQuietly(typeOS);
}

private OutputStream getTypeSystemOutputStream() throws IOException
{
if (typeSystemFile != null) {
return CompressionUtils.getOutputStream(typeSystemFile);
}

return getOutputStream("TypeSystem", ".xml");
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dkpro.core.io.xmi.internal;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * SAX {@link ContentHandler} decorator which replaces characters that are illegal in XML 1.0 or
 * XML 1.1 with a replacement character (a space by default) and forwards the sanitized events to
 * a delegate handler. Characters are replaced one-for-one in text nodes, ignorable whitespace and
 * attribute values, so the length of the text and all offsets into it stay unchanged.
 * <p>
 * Each event is sanitized on its own: a surrogate pair which is split across two separate
 * {@link #characters(char[], int, int)} events is seen as two unpaired surrogates and both halves
 * are replaced.
 */
public class IllegalXmlCharacterSanitizingContentHandler
    implements ContentHandler
{
    private boolean xml11 = false;
    private char replacementChar = ' ';
    private final ContentHandler delegate;

    /**
     * Creates a handler that applies the XML 1.0 character rules.
     *
     * @param aDelegate
     *            the handler which receives the sanitized events.
     */
    public IllegalXmlCharacterSanitizingContentHandler(ContentHandler aDelegate)
    {
        delegate = aDelegate;
    }

    /**
     * Creates a handler for the given XML version.
     *
     * @param aDelegate
     *            the handler which receives the sanitized events.
     * @param aVersion
     *            the XML version; {@code "1.1"} selects the XML 1.1 character rules, any other
     *            value (including {@code null}) selects the XML 1.0 rules.
     */
    public IllegalXmlCharacterSanitizingContentHandler(ContentHandler aDelegate, String aVersion)
    {
        delegate = aDelegate;
        // Assign the field directly instead of going through the overridable setter, so that a
        // subclass never observes a call on a partially constructed instance.
        xml11 = "1.1".equals(aVersion);
    }

    /**
     * @param aXml11
     *            {@code true} to apply the XML 1.1 character rules, {@code false} to apply the
     *            XML 1.0 rules.
     */
    public void setXml11(boolean aXml11)
    {
        xml11 = aXml11;
    }

    /**
     * @param aReplacementChar
     *            the character which is written in place of every illegal character. It is not
     *            validated itself.
     */
    public void setReplacementChar(char aReplacementChar)
    {
        replacementChar = aReplacementChar;
    }

    /**
     * Forwards the element start with a copy of the attributes in which all attribute values
     * have been sanitized. The attributes object passed in is not modified.
     */
    @Override
    public void startElement(String aUri, String aLocalName, String aQName, Attributes aAtts)
        throws SAXException
    {
        var sanitizedAtts = new AttributesImpl();

        // The SAX contract asks for an empty Attributes object when there are no attributes;
        // a null argument is tolerated defensively and treated the same way.
        if (aAtts != null) {
            for (int i = 0; i < aAtts.getLength(); i++) {
                sanitizedAtts.addAttribute(aAtts.getURI(i), aAtts.getLocalName(i),
                        aAtts.getQName(i), aAtts.getType(i),
                        sanitizeIllegalXmlCharacters(aAtts.getValue(i)));
            }
        }

        delegate.startElement(aUri, aLocalName, aQName, sanitizedAtts);
    }

    /**
     * Forwards a sanitized copy of the given character range. The caller's buffer is not
     * modified.
     */
    @Override
    public void characters(char[] aCh, int aStart, int aLength) throws SAXException
    {
        char[] sanitized = sanitizedCopy(aCh, aStart, aLength);
        delegate.characters(sanitized, 0, sanitized.length);
    }

    /**
     * Forwards a sanitized copy of the given whitespace range. The caller's buffer is not
     * modified.
     */
    @Override
    public void ignorableWhitespace(char[] aCh, int aStart, int aLength) throws SAXException
    {
        char[] sanitized = sanitizedCopy(aCh, aStart, aLength);
        delegate.ignorableWhitespace(sanitized, 0, sanitized.length);
    }

    /**
     * Copies the requested range exactly once and sanitizes the copy.
     */
    private char[] sanitizedCopy(char[] aCh, int aStart, int aLength)
    {
        char[] copy = new char[aLength];
        System.arraycopy(aCh, aStart, copy, 0, aLength);
        sanitizeInPlace(copy);
        return copy;
    }

    /**
     * Returns the text with all illegal characters replaced. The same instance is returned if
     * nothing had to be replaced; {@code null} is passed through.
     */
    private String sanitizeIllegalXmlCharacters(String aText)
    {
        if (aText == null) {
            return null;
        }

        char[] chars = aText.toCharArray();
        return sanitizeInPlace(chars) ? new String(chars) : aText;
    }

    /**
     * Replaces every illegal UTF-16 code unit in the array with the replacement character.
     *
     * @return {@code true} if at least one code unit was replaced.
     */
    private boolean sanitizeInPlace(char[] aChars)
    {
        boolean modified = false;

        for (int i = 0; i < aChars.length; i++) {
            char c = aChars[i];

            // Code points #x10000-#x10FFFF are encoded as a high surrogate followed by a low
            // surrogate, which is the only allowable combination of surrogates.
            if (Character.isHighSurrogate(c) && i + 1 < aChars.length) {
                if (Character.isLowSurrogate(aChars[i + 1])) {
                    // Well-formed pair: keep both code units and step over the low surrogate.
                    i++;
                }
                else {
                    // Unpaired high surrogate; the following code unit is checked on its own
                    // in the next iteration.
                    aChars[i] = replacementChar;
                    modified = true;
                }
                continue;
            }

            // A high surrogate at the very end of the array and any unpaired low surrogate fall
            // through to here and are rejected by the range check.
            if (!isValidXmlUtf16int(c)) {
                aChars[i] = replacementChar;
                modified = true;
            }
        }

        return modified;
    }

    /**
     * Checks a single non-surrogate UTF-16 code unit against the {@code Char} ranges of the
     * selected XML version. Surrogates are always reported as invalid here because valid pairs
     * are handled by the caller.
     */
    private boolean isValidXmlUtf16int(char c)
    {
        if (xml11) {
            return (c >= 0x1 && c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD);
        }

        return c == 0x9 || c == 0xA || c == 0xD || (c >= 0x20 && c <= 0xD7FF)
                || (c >= 0xE000 && c <= 0xFFFD);
    }

    @Override
    public void setDocumentLocator(Locator aLocator)
    {
        delegate.setDocumentLocator(aLocator);
    }

    @Override
    public void startDocument() throws SAXException
    {
        delegate.startDocument();
    }

    @Override
    public void endDocument() throws SAXException
    {
        delegate.endDocument();
    }

    @Override
    public void startPrefixMapping(String aPrefix, String aUri) throws SAXException
    {
        delegate.startPrefixMapping(aPrefix, aUri);
    }

    @Override
    public void endPrefixMapping(String aPrefix) throws SAXException
    {
        delegate.endPrefixMapping(aPrefix);
    }

    @Override
    public void endElement(String aUri, String aLocalName, String aQName) throws SAXException
    {
        delegate.endElement(aUri, aLocalName, aQName);
    }

    @Override
    public void processingInstruction(String aTarget, String aData) throws SAXException
    {
        delegate.processingInstruction(aTarget, aData);
    }

    @Override
    public void skippedEntity(String aName) throws SAXException
    {
        delegate.skippedEntity(aName);
    }
}
Loading