diff --git a/inception/inception-build/src/main/resources/inception/checkstyle.xml b/inception/inception-build/src/main/resources/inception/checkstyle.xml index e5e6fd44833..64e0fcbc3a0 100644 --- a/inception/inception-build/src/main/resources/inception/checkstyle.xml +++ b/inception/inception-build/src/main/resources/inception/checkstyle.xml @@ -38,6 +38,8 @@ + + diff --git a/inception/inception-imls-external/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/external/v1/ExternalRecommender.java b/inception/inception-imls-external/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/external/v1/ExternalRecommender.java index 2de24b30bd6..3d5abe5c0af 100644 --- a/inception/inception-imls-external/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/external/v1/ExternalRecommender.java +++ b/inception/inception-imls-external/src/main/java/de/tudarmstadt/ukp/inception/recommendation/imls/external/v1/ExternalRecommender.java @@ -72,6 +72,7 @@ import de.tudarmstadt.ukp.inception.recommendation.imls.external.v1.model.Document; import de.tudarmstadt.ukp.inception.recommendation.imls.external.v1.model.Metadata; import de.tudarmstadt.ukp.inception.rendering.model.Range; +import de.tudarmstadt.ukp.inception.support.xml.sanitizer.IllegalXmlCharacterSanitizingContentHandler; public class ExternalRecommender extends RecommendationEngine @@ -194,7 +195,7 @@ public Range predict(RecommenderContext aContext, CAS aCas, int aBegin, int aEnd throw new RecommendationException("Error while deserializing CAS!", e); } - return new Range(aCas); + return Range.rangeCoveringDocument(aCas); } private String serializeTypeSystem(CAS aCas) throws RecommendationException @@ -210,13 +211,13 @@ private String serializeTypeSystem(CAS aCas) throws RecommendationException private String serializeCas(CAS aCas) throws RecommendationException { - try (StringWriter out = new StringWriter()) { + try (var out = new StringWriter()) { // Passing "null" as the type system to 
the XmiCasSerializer means that we want // to serialize all types (i.e. no filtering for a specific target type system). XmiCasSerializer xmiCasSerializer = new XmiCasSerializer(null); - XMLSerializer sax2xml = new XMLSerializer(out, true); - xmiCasSerializer.serialize(getRealCas(aCas), sax2xml.getContentHandler(), null, null, - null); + var contentHandler = new XMLSerializer(out, true).getContentHandler(); + contentHandler = new IllegalXmlCharacterSanitizingContentHandler(contentHandler); + xmiCasSerializer.serialize(getRealCas(aCas), contentHandler, null, null, null); return out.toString(); } catch (CASRuntimeException | SAXException | IOException e) { diff --git a/inception/inception-io-json/pom.xml b/inception/inception-io-json/pom.xml index c76d8fdda21..564aa96ddda 100644 --- a/inception/inception-io-json/pom.xml +++ b/inception/inception-io-json/pom.xml @@ -50,6 +50,10 @@ org.springframework spring-beans + + org.springframework.boot + spring-boot + org.springframework.boot spring-boot-autoconfigure diff --git a/inception/inception-io-json/src/main/java/de/tudarmstadt/ukp/inception/io/jsoncas/LegacyUimaJsonFormatSupport.java b/inception/inception-io-json/src/main/java/de/tudarmstadt/ukp/inception/io/jsoncas/LegacyUimaJsonFormatSupport.java index 06684d9d259..ef073e48632 100644 --- a/inception/inception-io-json/src/main/java/de/tudarmstadt/ukp/inception/io/jsoncas/LegacyUimaJsonFormatSupport.java +++ b/inception/inception-io-json/src/main/java/de/tudarmstadt/ukp/inception/io/jsoncas/LegacyUimaJsonFormatSupport.java @@ -27,6 +27,7 @@ import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport; import de.tudarmstadt.ukp.clarin.webanno.model.Project; +import de.tudarmstadt.ukp.inception.io.jsoncas.config.LegacyUimaJsonCasFormatProperties; public class LegacyUimaJsonFormatSupport implements FormatSupport @@ -34,6 +35,13 @@ public class LegacyUimaJsonFormatSupport public static final String ID = "json"; public static final String NAME = "UIMA CAS JSON 
(legacy)"; + private final LegacyUimaJsonCasFormatProperties properties; + + public LegacyUimaJsonFormatSupport(LegacyUimaJsonCasFormatProperties aProps) + { + properties = aProps; + } + @Override public String getId() { @@ -57,6 +65,7 @@ public AnalysisEngineDescription getWriterDescription(Project aProject, TypeSystemDescription aTSD, CAS aCAS) throws ResourceInitializationException { - return createEngineDescription(JsonWriter.class, aTSD); + return createEngineDescription(JsonWriter.class, aTSD, // + JsonWriter.PARAM_OMIT_DEFAULT_VALUES, properties.isOmitDefaultValues()); } } diff --git a/inception/inception-io-json/src/main/java/de/tudarmstadt/ukp/inception/io/jsoncas/config/LegacyUimaJsonCasFormatProperties.java b/inception/inception-io-json/src/main/java/de/tudarmstadt/ukp/inception/io/jsoncas/config/LegacyUimaJsonCasFormatProperties.java new file mode 100644 index 00000000000..2ed7bd2eff7 --- /dev/null +++ b/inception/inception-io-json/src/main/java/de/tudarmstadt/ukp/inception/io/jsoncas/config/LegacyUimaJsonCasFormatProperties.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package de.tudarmstadt.ukp.inception.io.jsoncas.config;
+
+import org.springframework.boot.context.properties.ConfigurationProperties;
+
+/**
+ * Settings of the legacy UIMA CAS JSON format, read from properties below the
+ * {@code format.json-cas-legacy} prefix.
+ */
+@ConfigurationProperties("format.json-cas-legacy")
+public class LegacyUimaJsonCasFormatProperties
+{
+    // A boolean field starts out as false, i.e. default values are written unless configured
+    private boolean omitDefaultValues;
+
+    /**
+     * @return whether values equal to their default should be left out of the JSON output;
+     *         {@code false} unless it has been set otherwise.
+     */
+    public boolean isOmitDefaultValues()
+    {
+        return omitDefaultValues;
+    }
+
+    /**
+     * @param aOmitDefaultValues
+     *            whether values equal to their default should be left out of the JSON output.
+     */
+    public void setOmitDefaultValues(boolean aOmitDefaultValues)
+    {
+        this.omitDefaultValues = aOmitDefaultValues;
+    }
+}
diff --git a/inception/inception-io-json/src/main/java/de/tudarmstadt/ukp/inception/io/jsoncas/config/UimaJsonCasSupportAutoConfiguration.java b/inception/inception-io-json/src/main/java/de/tudarmstadt/ukp/inception/io/jsoncas/config/UimaJsonCasSupportAutoConfiguration.java index a4c53297865..c4b3256f651 100644 --- a/inception/inception-io-json/src/main/java/de/tudarmstadt/ukp/inception/io/jsoncas/config/UimaJsonCasSupportAutoConfiguration.java +++ b/inception/inception-io-json/src/main/java/de/tudarmstadt/ukp/inception/io/jsoncas/config/UimaJsonCasSupportAutoConfiguration.java @@ -18,6 +18,7 @@ package de.tudarmstadt.ukp.inception.io.jsoncas.config; import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.boot.context.properties.EnableConfigurationProperties; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; @@ -26,14 +27,16 @@ import de.tudarmstadt.ukp.inception.io.jsoncas.UimaJsonCasFormatSupport; @Configuration +@EnableConfigurationProperties(LegacyUimaJsonCasFormatProperties.class) public class UimaJsonCasSupportAutoConfiguration { @ConditionalOnProperty(prefix = "format.json-cas-legacy", name = "enabled", // havingValue = "true", matchIfMissing = false) @Bean - public LegacyUimaJsonFormatSupport legacyUimaJsonFormatSupport() + public LegacyUimaJsonFormatSupport legacyUimaJsonFormatSupport( + LegacyUimaJsonCasFormatProperties aProps) { - return new 
LegacyUimaJsonFormatSupport(); + return new LegacyUimaJsonFormatSupport(aProps); } @ConditionalOnProperty(prefix = "format.json-cas", name = "enabled", // diff --git a/inception/inception-io-json/src/main/resources/META-INF/asciidoc/user-guide/formats-uimajson-legacy.adoc b/inception/inception-io-json/src/main/resources/META-INF/asciidoc/user-guide/formats-uimajson-legacy.adoc index f3f1467bd84..263356b2a72 100644 --- a/inception/inception-io-json/src/main/resources/META-INF/asciidoc/user-guide/formats-uimajson-legacy.adoc +++ b/inception/inception-io-json/src/main/resources/META-INF/asciidoc/user-guide/formats-uimajson-legacy.adoc @@ -23,7 +23,6 @@ CAUTION: Legacy feature. To use this functionality, you need to enable it first Support for this feature will be removed in a future version. The replacement is <>. ==== - This is an old and deprecated UIMA CAS JSON format which can be exported but not imported. It should no longer be used. Instead, one should turn to <>. @@ -31,6 +30,10 @@ The format does support custom layers. For more details on this format, please refer to the link:https://uima.apache.org/d/uimaj-current/references.html#ugr.ref.json[UIMA Reference Guide]. +By default, the format writes all values to the JSON output, even if the values are the default values +in JSON (e.g. `0` for numbers or `false` for booleans). You can configure this behavior by setting +`format.json-cas-legacy.omit-default-values` to `true` or `false` (default) respectively. 
+ [cols="2,1,1,1,3"] |==== | Format | Read | Write | Custom Layers | Description @@ -41,4 +44,3 @@ For more details on this format, please refer to the link:https://uima.apache.or | yes | UIMA CAS JSON (legacy) |==== - diff --git a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/clarin/webanno/support/xml/ContentHandlerAdapter.java b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/clarin/webanno/support/xml/ContentHandlerAdapter.java index 60425e24285..effbcd0bad0 100644 --- a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/clarin/webanno/support/xml/ContentHandlerAdapter.java +++ b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/clarin/webanno/support/xml/ContentHandlerAdapter.java @@ -37,6 +37,11 @@ public class ContentHandlerAdapter protected final ContentHandler delegate; + public ContentHandlerAdapter() + { + delegate = null; + } + public ContentHandlerAdapter(ContentHandler aDelegate) { delegate = aDelegate; @@ -45,30 +50,50 @@ public ContentHandlerAdapter(ContentHandler aDelegate) @Override public void setDocumentLocator(Locator aLocator) { + if (delegate == null) { + return; + } + delegate.setDocumentLocator(aLocator); } @Override public void startDocument() throws SAXException { + if (delegate == null) { + return; + } + delegate.startDocument(); } @Override public void endDocument() throws SAXException { + if (delegate == null) { + return; + } + delegate.endDocument(); } @Override public void startPrefixMapping(String aPrefix, String aUri) throws SAXException { + if (delegate == null) { + return; + } + delegate.startPrefixMapping(aPrefix, aUri); } @Override public void endPrefixMapping(String aPrefix) throws SAXException { + if (delegate == null) { + return; + } + delegate.endPrefixMapping(aPrefix); } @@ -117,6 +142,10 @@ public void startElement(String aLocalName, Map aAttributes) thr public void startElement(String aUri, String aLocalName, String aQName, Attributes aAtts) throws SAXException { + if 
(delegate == null) { + return; + } + delegate.startElement(aUri, aLocalName, aQName, aAtts); } @@ -133,6 +162,10 @@ public void endElement(QName aElement) throws SAXException @Override public void endElement(String aUri, String aLocalName, String aQName) throws SAXException { + if (delegate == null) { + return; + } + delegate.endElement(aUri, aLocalName, aQName); } @@ -144,24 +177,40 @@ public void characters(String aString) throws SAXException @Override public void characters(char[] aCh, int aStart, int aLength) throws SAXException { + if (delegate == null) { + return; + } + delegate.characters(aCh, aStart, aLength); } @Override public void ignorableWhitespace(char[] aCh, int aStart, int aLength) throws SAXException { + if (delegate == null) { + return; + } + delegate.ignorableWhitespace(aCh, aStart, aLength); } @Override public void processingInstruction(String aTarget, String aData) throws SAXException { + if (delegate == null) { + return; + } + delegate.processingInstruction(aTarget, aData); } @Override public void skippedEntity(String aName) throws SAXException { + if (delegate == null) { + return; + } + delegate.skippedEntity(aName); } } diff --git a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/sanitizer/IllegalXmlCharacterSanitizingContentHandler.java b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/sanitizer/IllegalXmlCharacterSanitizingContentHandler.java new file mode 100644 index 00000000000..625335b2db5 --- /dev/null +++ b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/sanitizer/IllegalXmlCharacterSanitizingContentHandler.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package de.tudarmstadt.ukp.inception.support.xml.sanitizer; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; + +import de.tudarmstadt.ukp.clarin.webanno.support.xml.ContentHandlerAdapter; + +/** + * Replaces characters which are illegal in XML 1.0 or XML 1.1 with a replacement character. The + * characters are replaced in text nodes as well as in attribute values. 
+ * <p>
+ * A high surrogate that is directly followed by a low surrogate within the same call is passed
+ * through unchanged; any other surrogate code unit is replaced. Every call is sanitized on its
+ * own, so a surrogate pair that a producer splits across two {@code characters} calls is treated
+ * as two unpaired surrogates.
+ * <p>
+ * Events are forwarded through {@link ContentHandlerAdapter}, so the handler also works when no
+ * delegate is set.
+ */
+public class IllegalXmlCharacterSanitizingContentHandler
+    extends ContentHandlerAdapter
+{
+    private boolean xml11 = false;
+    private char replacementChar = ' ';
+
+    /**
+     * @param aDelegate
+     *            the handler which receives the sanitized events.
+     */
+    public IllegalXmlCharacterSanitizingContentHandler(ContentHandler aDelegate)
+    {
+        super(aDelegate);
+    }
+
+    /**
+     * @param aXml11
+     *            {@code true} to apply the character range of XML 1.1, {@code false} (the
+     *            default) to apply the stricter range of XML 1.0.
+     */
+    public void setXml11(boolean aXml11)
+    {
+        xml11 = aXml11;
+    }
+
+    /**
+     * @param aReplacementChar
+     *            the character written in place of every illegal one (a space by default). It is
+     *            not itself checked for being legal in XML.
+     */
+    public void setReplacementChar(char aReplacementChar)
+    {
+        replacementChar = aReplacementChar;
+    }
+
+    /**
+     * Forwards the element with a copy of its attributes in which all values are sanitized.
+     */
+    @Override
+    public void startElement(String aUri, String aLocalName, String aQName, Attributes aAtts)
+        throws SAXException
+    {
+        var sanitizedAtts = new AttributesImpl();
+        for (int i = 0; i < aAtts.getLength(); i++) {
+            sanitizedAtts.addAttribute(aAtts.getURI(i), aAtts.getLocalName(i),
+                    aAtts.getQName(i), aAtts.getType(i), sanitize(aAtts.getValue(i)));
+        }
+
+        super.startElement(aUri, aLocalName, aQName, sanitizedAtts);
+    }
+
+    /**
+     * Forwards a sanitized copy of the given range; the caller's array is left untouched.
+     */
+    @Override
+    public void characters(char[] aCh, int aStart, int aLength) throws SAXException
+    {
+        // Forward via the superclass rather than the delegate field: the superclass skips the
+        // call when there is no delegate instead of failing with a NullPointerException.
+        super.characters(sanitizedCopy(aCh, aStart, aLength), 0, aLength);
+    }
+
+    /**
+     * Forwards a sanitized copy of the given range; the caller's array is left untouched.
+     */
+    @Override
+    public void ignorableWhitespace(char[] aCh, int aStart, int aLength) throws SAXException
+    {
+        // See characters(...) for why this goes through the superclass
+        super.ignorableWhitespace(sanitizedCopy(aCh, aStart, aLength), 0, aLength);
+    }
+
+    /** Returns the text with every illegal character replaced. */
+    private String sanitize(String aText)
+    {
+        char[] chars = aText.toCharArray();
+        sanitizeInPlace(chars);
+        return new String(chars);
+    }
+
+    /** Copies the given range into a new array and replaces every illegal character in it. */
+    private char[] sanitizedCopy(char[] aCh, int aStart, int aLength)
+    {
+        char[] copy = new char[aLength];
+        System.arraycopy(aCh, aStart, copy, 0, aLength);
+        sanitizeInPlace(copy);
+        return copy;
+    }
+
+    /** Overwrites every illegal code unit of the array with the replacement character. */
+    private void sanitizeInPlace(char[] aChars)
+    {
+        for (int i = 0; i < aChars.length; i++) {
+            char c = aChars[i];
+
+            // A high surrogate directly followed by a low surrogate encodes a code point in the
+            // range #x10000-#x10FFFF, which is legal in both XML versions: keep both halves.
+            if (Character.isHighSurrogate(c) && i + 1 < aChars.length
+                    && Character.isLowSurrogate(aChars[i + 1])) {
+                i++;
+                continue;
+            }
+
+            // Everything else is judged as a single code unit. Unpaired surrogates lie outside
+            // the legal ranges and are therefore replaced here as well.
+            if (!isLegalXmlChar(c)) {
+                aChars[i] = replacementChar;
+            }
+        }
+    }
+
+    /**
+     * Checks a single UTF-16 code unit against the {@code Char} production of the selected XML
+     * version. Surrogate code units are never legal here.
+     */
+    private boolean isLegalXmlChar(char aChar)
+    {
+        if (xml11) {
+            // XML 1.1: [#x1-#xD7FF] | [#xE000-#xFFFD]
+            return (aChar >= 0x1 && aChar <= 0xD7FF) || (aChar >= 0xE000 && aChar <= 0xFFFD);
+        }
+
+        // XML 1.0: #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD]
+        return aChar == 0x9 || aChar == 0xA || aChar == 0xD
+                || (aChar >= 0x20 && aChar <= 0xD7FF) || (aChar >= 0xE000 && aChar <= 0xFFFD);
+    }
+}
diff --git a/inception/inception-support/src/test/java/de/tudarmstadt/ukp/inception/support/xml/sanitizer/IllegalXmlCharacterSanitizingContentHandlerTest.java b/inception/inception-support/src/test/java/de/tudarmstadt/ukp/inception/support/xml/sanitizer/IllegalXmlCharacterSanitizingContentHandlerTest.java new file mode 100644 index 00000000000..6a563b8abeb --- /dev/null +++ b/inception/inception-support/src/test/java/de/tudarmstadt/ukp/inception/support/xml/sanitizer/IllegalXmlCharacterSanitizingContentHandlerTest.java @@ -0,0 +1,131 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package de.tudarmstadt.ukp.inception.support.xml.sanitizer;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import org.junit.jupiter.api.Test;
+import org.xml.sax.SAXException;
+
+import de.tudarmstadt.ukp.clarin.webanno.support.xml.ContentHandlerAdapter;
+
+//CHECKSTYLE:OFF
+/**
+ * Checks which characters the sanitizing handler replaces in character data, for both XML
+ * versions and for paired as well as unpaired surrogates.
+ */
+class IllegalXmlCharacterSanitizingContentHandlerTest
+{
+    private static final char REPLACEMENT = '\uFFFD';
+
+    @Test
+    void testXml10() throws Exception
+    {
+        var sink = new ContentToString();
+        var sut = sanitizerWritingTo(sink, false);
+
+        char[] input = controlCharsSpaceAndLoneHighSurrogate();
+        sut.characters(input, 0, input.length);
+
+        // Of this input, XML 1.0 only permits tab, line feed, carriage return and space
+        char[] expected = new char[input.length];
+        for (int i = 0; i < expected.length; i++) {
+            expected[i] = REPLACEMENT;
+        }
+        expected[0x09] = '\t';
+        expected[0x0A] = '\n';
+        expected[0x0D] = '\r';
+        expected[0x20] = ' ';
+
+        assertThat(sink.toString().toCharArray()) //
+                .hasSameSizeAs(input) //
+                .containsExactly(expected);
+    }
+
+    @Test
+    void testXml11() throws Exception
+    {
+        var sink = new ContentToString();
+        var sut = sanitizerWritingTo(sink, true);
+
+        char[] input = controlCharsSpaceAndLoneHighSurrogate();
+        sut.characters(input, 0, input.length);
+
+        // XML 1.1 only rejects the NUL at the start and the unpaired surrogate at the end
+        char[] expected = input.clone();
+        expected[0] = REPLACEMENT;
+        expected[expected.length - 1] = REPLACEMENT;
+
+        assertThat(sink.toString().toCharArray()) //
+                .hasSameSizeAs(input) //
+                .containsExactly(expected);
+    }
+
+    @Test
+    void testWithSurrogate() throws Exception
+    {
+        var sink = new ContentToString();
+        var sut = sanitizerWritingTo(sink, false);
+
+        var input = "🙋🏽‍♀️";
+        sut.characters(input);
+
+        assertThat(sink.toString()) //
+                .hasSameSizeAs(input) //
+                .isEqualTo("🙋🏽‍♀️");
+    }
+
+    @Test
+    void testWithBrokenSurrogate() throws Exception
+    {
+        var sink = new ContentToString();
+        var sut = sanitizerWritingTo(sink, false);
+
+        // Low surrogate first, high surrogate second: neither half is part of a valid pair
+        char[] input = { '\ude4b', '\ud83d' };
+        sut.characters(input, 0, input.length);
+
+        var actual = sink.toString();
+        assertThat(actual).hasSameSizeAs(input);
+        assertThat(actual.toCharArray()).containsExactly(REPLACEMENT, REPLACEMENT);
+    }
+
+    /**
+     * Creates the handler under test, configured with the replacement character used by all
+     * tests and forwarding to the given sink.
+     */
+    private static IllegalXmlCharacterSanitizingContentHandler sanitizerWritingTo(
+            ContentToString aSink, boolean aXml11)
+    {
+        var handler = new IllegalXmlCharacterSanitizingContentHandler(aSink);
+        handler.setXml11(aXml11);
+        handler.setReplacementChar(REPLACEMENT);
+        return handler;
+    }
+
+    /**
+     * Builds the shared input: the code units 0x00 to 0x20 in ascending order, followed by a
+     * single unpaired high surrogate.
+     */
+    private static char[] controlCharsSpaceAndLoneHighSurrogate()
+    {
+        char[] chars = new char[0x22];
+        for (char c = 0; c <= 0x20; c++) {
+            chars[c] = c;
+        }
+        chars[0x21] = '\uD800';
+        return chars;
+    }
+
+    /**
+     * Collects character data and ignorable whitespace into one string; the collected text is
+     * cleared when a document starts.
+     */
+    private static class ContentToString
+        extends ContentHandlerAdapter
+    {
+        private final StringBuilder collected = new StringBuilder();
+
+        @Override
+        public void startDocument() throws SAXException
+        {
+            collected.setLength(0);
+        }
+
+        @Override
+        public void characters(char[] aCh, int aStart, int aLength) throws SAXException
+        {
+            collected.append(aCh, aStart, aLength);
+        }
+
+        @Override
+        public void ignorableWhitespace(char[] aCh, int aStart, int aLength) throws SAXException
+        {
+            collected.append(aCh, aStart, aLength);
+        }
+
+        @Override
+        public String toString()
+        {
+            return collected.toString();
+        }
+    }
+}