Skip to content

Commit

Permalink
Merge branch 'release/28.x' into bugfix/1496-Some-spans-are-missing-b…
Browse files Browse the repository at this point in the history
…egin-offset-field

* release/28.x:
  #1511 - External recommender fails when CAS contains control characters
  • Loading branch information
reckart committed May 30, 2023
2 parents 3b33be0 + 16289c6 commit aff80b7
Show file tree
Hide file tree
Showing 5 changed files with 312 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
</module>

<module name="TreeWalker">
<module name="SuppressionCommentFilter"/>

<module name="OuterTypeFilename"/>
<module name="IllegalTokenText">
<property name="tokens" value="STRING_LITERAL, CHAR_LITERAL"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
import de.tudarmstadt.ukp.inception.recommendation.imls.external.v1.model.Document;
import de.tudarmstadt.ukp.inception.recommendation.imls.external.v1.model.Metadata;
import de.tudarmstadt.ukp.inception.rendering.model.Range;
import de.tudarmstadt.ukp.inception.support.xml.sanitizer.IllegalXmlCharacterSanitizingContentHandler;

public class ExternalRecommender
extends RecommendationEngine
Expand Down Expand Up @@ -194,7 +195,7 @@ public Range predict(RecommenderContext aContext, CAS aCas, int aBegin, int aEnd
throw new RecommendationException("Error while deserializing CAS!", e);
}

return new Range(aCas);
return Range.rangeCoveringDocument(aCas);
}

private String serializeTypeSystem(CAS aCas) throws RecommendationException
Expand All @@ -210,13 +211,13 @@ private String serializeTypeSystem(CAS aCas) throws RecommendationException

private String serializeCas(CAS aCas) throws RecommendationException
{
try (StringWriter out = new StringWriter()) {
try (var out = new StringWriter()) {
// Passing "null" as the type system to the XmiCasSerializer means that we want
// to serialize all types (i.e. no filtering for a specific target type system).
XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null);
XMLSerializer sax2xml = new XMLSerializer(out, true);
xmiCasSerializer.serialize(getRealCas(aCas), sax2xml.getContentHandler(), null, null,
null);
var contentHandler = new XMLSerializer(out, true).getContentHandler();
contentHandler = new IllegalXmlCharacterSanitizingContentHandler(contentHandler);
xmiCasSerializer.serialize(getRealCas(aCas), contentHandler, null, null, null);
return out.toString();
}
catch (CASRuntimeException | SAXException | IOException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ public class ContentHandlerAdapter

protected final ContentHandler delegate;

/**
 * Creates an adapter that has no delegate. The delegate field is set to {@code null}.
 */
public ContentHandlerAdapter()
{
    this.delegate = null;
}

public ContentHandlerAdapter(ContentHandler aDelegate)
{
delegate = aDelegate;
Expand All @@ -45,30 +50,50 @@ public ContentHandlerAdapter(ContentHandler aDelegate)
/**
 * Hands the document locator to the delegate. Does nothing when no delegate is set.
 *
 * @param aLocator
 *            the locator to pass on.
 */
@Override
public void setDocumentLocator(Locator aLocator)
{
    if (delegate != null) {
        delegate.setDocumentLocator(aLocator);
    }
}

/**
 * Forwards the start-of-document event to the delegate. Does nothing when no delegate is set.
 *
 * @throws SAXException
 *             if the delegate reports an error.
 */
@Override
public void startDocument() throws SAXException
{
    if (delegate != null) {
        delegate.startDocument();
    }
}

/**
 * Forwards the end-of-document event to the delegate. Does nothing when no delegate is set.
 *
 * @throws SAXException
 *             if the delegate reports an error.
 */
@Override
public void endDocument() throws SAXException
{
    if (delegate != null) {
        delegate.endDocument();
    }
}

/**
 * Forwards the start of a namespace prefix mapping to the delegate. Does nothing when no
 * delegate is set.
 *
 * @param aPrefix
 *            the namespace prefix being declared.
 * @param aUri
 *            the namespace URI the prefix is mapped to.
 * @throws SAXException
 *             if the delegate reports an error.
 */
@Override
public void startPrefixMapping(String aPrefix, String aUri) throws SAXException
{
    if (delegate != null) {
        delegate.startPrefixMapping(aPrefix, aUri);
    }
}

/**
 * Forwards the end of a namespace prefix mapping to the delegate. Does nothing when no delegate
 * is set.
 *
 * @param aPrefix
 *            the namespace prefix whose mapping ends.
 * @throws SAXException
 *             if the delegate reports an error.
 */
@Override
public void endPrefixMapping(String aPrefix) throws SAXException
{
    if (delegate != null) {
        delegate.endPrefixMapping(aPrefix);
    }
}

Expand Down Expand Up @@ -117,6 +142,10 @@ public void startElement(String aLocalName, Map<String, String> aAttributes) thr
public void startElement(String aUri, String aLocalName, String aQName, Attributes aAtts)
    throws SAXException
{
    // Forward the start-of-element event unchanged; without a delegate the event is dropped.
    if (delegate != null) {
        delegate.startElement(aUri, aLocalName, aQName, aAtts);
    }
}

Expand All @@ -133,6 +162,10 @@ public void endElement(QName aElement) throws SAXException
/**
 * Forwards the end-of-element event to the delegate. Does nothing when no delegate is set.
 *
 * @param aUri
 *            the namespace URI of the element.
 * @param aLocalName
 *            the local name of the element.
 * @param aQName
 *            the qualified name of the element.
 * @throws SAXException
 *             if the delegate reports an error.
 */
@Override
public void endElement(String aUri, String aLocalName, String aQName) throws SAXException
{
    if (delegate != null) {
        delegate.endElement(aUri, aLocalName, aQName);
    }
}

Expand All @@ -144,24 +177,40 @@ public void characters(String aString) throws SAXException
/**
 * Forwards character data to the delegate. Does nothing when no delegate is set.
 *
 * @param aCh
 *            the buffer holding the characters.
 * @param aStart
 *            the offset of the first character in the buffer.
 * @param aLength
 *            the number of characters to use from the buffer.
 * @throws SAXException
 *             if the delegate reports an error.
 */
@Override
public void characters(char[] aCh, int aStart, int aLength) throws SAXException
{
    if (delegate != null) {
        delegate.characters(aCh, aStart, aLength);
    }
}

/**
 * Forwards ignorable whitespace to the delegate. Does nothing when no delegate is set.
 *
 * @param aCh
 *            the buffer holding the whitespace characters.
 * @param aStart
 *            the offset of the first character in the buffer.
 * @param aLength
 *            the number of characters to use from the buffer.
 * @throws SAXException
 *             if the delegate reports an error.
 */
@Override
public void ignorableWhitespace(char[] aCh, int aStart, int aLength) throws SAXException
{
    if (delegate != null) {
        delegate.ignorableWhitespace(aCh, aStart, aLength);
    }
}

/**
 * Forwards a processing instruction to the delegate. Does nothing when no delegate is set.
 *
 * @param aTarget
 *            the processing instruction target.
 * @param aData
 *            the processing instruction data.
 * @throws SAXException
 *             if the delegate reports an error.
 */
@Override
public void processingInstruction(String aTarget, String aData) throws SAXException
{
    if (delegate != null) {
        delegate.processingInstruction(aTarget, aData);
    }
}

/**
 * Forwards a skipped-entity notification to the delegate. Does nothing when no delegate is set.
 *
 * @param aName
 *            the name of the skipped entity.
 * @throws SAXException
 *             if the delegate reports an error.
 */
@Override
public void skippedEntity(String aName) throws SAXException
{
    if (delegate != null) {
        delegate.skippedEntity(aName);
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.support.xml.sanitizer;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import de.tudarmstadt.ukp.clarin.webanno.support.xml.ContentHandlerAdapter;

/**
 * Replaces characters which are illegal in XML 1.0 or XML 1.1 with a replacement character. The
 * characters are replaced in text nodes, in ignorable whitespace and in attribute values. Other
 * events (e.g. processing instructions) are passed through unchanged by the superclass.
 * <p>
 * By default, the XML 1.0 character rules are applied and illegal characters are replaced with a
 * space. Both can be changed via {@link #setXml11(boolean)} and
 * {@link #setReplacementChar(char)}.
 * <p>
 * Each event is sanitized on its own. A surrogate pair that is split across two consecutive
 * {@link #characters(char[], int, int)} calls is therefore seen as two unpaired surrogates and
 * both halves are replaced.
 */
public class IllegalXmlCharacterSanitizingContentHandler
    extends ContentHandlerAdapter
{
    private boolean xml11 = false;
    private char replacementChar = ' ';

    /**
     * @param aDelegate
     *            the handler receiving the sanitized events.
     */
    public IllegalXmlCharacterSanitizingContentHandler(ContentHandler aDelegate)
    {
        super(aDelegate);
    }

    /**
     * @param aXml11
     *            {@code true} to validate against the XML 1.1 character range, {@code false} to
     *            validate against the XML 1.0 character range (the default).
     */
    public void setXml11(boolean aXml11)
    {
        xml11 = aXml11;
    }

    /**
     * @param aReplacementChar
     *            the character written in place of every illegal character (default: space).
     */
    public void setReplacementChar(char aReplacementChar)
    {
        replacementChar = aReplacementChar;
    }

    /**
     * Forwards the element start with a copy of the attributes in which every attribute value has
     * been sanitized. The element and attribute names are passed on as they are.
     */
    @Override
    public void startElement(String aUri, String aLocalName, String aQName, Attributes aAtts)
        throws SAXException
    {
        var newAtts = new AttributesImpl();
        // Tolerate producers handing over no attribute list at all instead of an empty one.
        if (aAtts != null) {
            for (int i = 0; i < aAtts.getLength(); i++) {
                var value = sanitizeIllegalXmlCharacters(aAtts.getValue(i));
                newAtts.addAttribute(aAtts.getURI(i), aAtts.getLocalName(i), aAtts.getQName(i),
                        aAtts.getType(i), value);
            }
        }

        super.startElement(aUri, aLocalName, aQName, newAtts);
    }

    /**
     * Forwards a sanitized copy of the given character range. The input buffer is not modified.
     */
    @Override
    public void characters(char[] aCh, int aStart, int aLength) throws SAXException
    {
        String s = sanitizeIllegalXmlCharacters(new String(aCh, aStart, aLength));
        // Go through the superclass (like startElement does) so that its handling of a missing
        // delegate applies here as well instead of dereferencing the delegate directly.
        super.characters(s.toCharArray(), 0, s.length());
    }

    /**
     * Forwards a sanitized copy of the given whitespace range. The input buffer is not modified.
     */
    @Override
    public void ignorableWhitespace(char[] aCh, int aStart, int aLength) throws SAXException
    {
        String s = sanitizeIllegalXmlCharacters(new String(aCh, aStart, aLength));
        // See characters(): route through the superclass rather than the raw delegate.
        super.ignorableWhitespace(s.toCharArray(), 0, s.length());
    }

    /**
     * Returns a copy of the text in which every illegal UTF-16 code unit has been replaced by the
     * replacement character. A high surrogate immediately followed by a low surrogate is kept;
     * any other surrogate is replaced. The result has the same length as the input.
     *
     * @param aText
     *            the text to sanitize, may be {@code null}.
     * @return the sanitized text or {@code null} if the input was {@code null}.
     */
    private String sanitizeIllegalXmlCharacters(String aText)
    {
        if (aText == null) {
            return null;
        }

        char[] chars = aText.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            char c = chars[i];

            // The case for Unicode code points #x10000-#x10FFFF: a high surrogate followed by a
            // low surrogate is the only allowable combination. Keep both and skip the low one.
            if (Character.isHighSurrogate(c) && i + 1 < chars.length
                    && Character.isLowSurrogate(chars[i + 1])) {
                i++;
                continue;
            }

            // Unpaired surrogates end up here as well: they are outside the valid ranges.
            if (!isValidXmlUtf16int(c)) {
                chars[i] = replacementChar;
            }
        }

        return new String(chars);
    }

    /**
     * Checks a single UTF-16 code unit against the character range of the selected XML version.
     * Surrogates are never valid here; valid pairs are handled by the caller.
     */
    private boolean isValidXmlUtf16int(char c)
    {
        if (xml11) {
            return (c >= 0x1 && c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD);
        }

        return c == 0x9 || c == 0xA || c == 0xD || (c >= 0x20 && c <= 0xD7FF)
                || (c >= 0xE000 && c <= 0xFFFD);
    }
}
Loading

0 comments on commit aff80b7

Please sign in to comment.