Merge branch '0.14.x'

* 0.14.x: #1587 - Disable fact linking support #1587 - Disable fact linking support No issue. Depend on WebAnno 4.0.0 beta 9 for next release. #1587 - Disable fact linking support #1582 - Support IMS CWB format #1582 - Support IMS CWB format #1581 - Update and modularize format documentation % Conflicts: % pom.xml
inception-project · Dec 31, 2019 · 01024d9 · 01024d9
2 parents 830cdc2 + a57fff1
commit 01024d9
Show file tree

Hide file tree

Showing 12 changed files with 441 additions and 108 deletions.
diff --git a/inception-app-webapp/pom.xml b/inception-app-webapp/pom.xml
@@ -200,6 +200,10 @@
       <groupId>org.dkpro.core</groupId>
       <artifactId>dkpro-core-io-xml-asl</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.dkpro.core</groupId>
+      <artifactId>dkpro-core-io-imscwb-asl</artifactId>
+    </dependency>
 
     <!-- WEBANNO DEPENDENCIES -->
     <dependency>

diff --git a/...on-app-webapp/src/main/java/de/tudarmstadt/ukp/inception/formats/ImsCwbFormatSupport.java b/...on-app-webapp/src/main/java/de/tudarmstadt/ukp/inception/formats/ImsCwbFormatSupport.java
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2019
+ * Ubiquitous Knowledge Processing (UKP) Lab
+ * Technische Universität Darmstadt
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package de.tudarmstadt.ukp.inception.formats;
+
+import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;
+
+import org.apache.uima.collection.CollectionReaderDescription;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.dkpro.core.io.imscwb.ImsCwbReader;
+import org.springframework.stereotype.Component;
+
+import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport;
+
+@Component // Spring stereotype annotation: lets component scanning pick this class up
+public class ImsCwbFormatSupport // FormatSupport for the IMS Corpus Workbench (VRT) format
+    implements FormatSupport
+{
+    public static final String ID = "imscwb"; // identifier returned by getId()
+    public static final String NAME = "Corpus Workbench Format (aka VRT)"; // see getName()
+
+    @Override
+    public String getId() // returns the constant ID
+    {
+        return ID;
+    }
+
+    @Override
+    public String getName() // returns the constant NAME
+    {
+        return NAME;
+    }
+
+    @Override
+    public boolean isReadable() // always true; the matching reader is built below
+    {
+        return true;
+    }
+
+//    @Override // NOTE(review): left disabled - isWritable() is not overridden in this class
+//    public boolean isWritable()
+//    {
+//        return true;
+//    }
+
+    @Override
+    public CollectionReaderDescription getReaderDescription() throws ResourceInitializationException
+    {
+        // No configuration parameters are passed; the description covers ImsCwbReader only.
+        return createReaderDescription(ImsCwbReader.class);
+    }
+
+//    @Override // NOTE(review): left disabled together with isWritable() above
+//    public AnalysisEngineDescription getWriterDescription(Project aProject, CAS aCAS)
+//        throws ResourceInitializationException
+//    {
+//        return createEngineDescription(ImsCwbWriter.class);
+//    }
+}
diff --git a/inception-doc/src/main/resources/META-INF/asciidoc/user-guide.adoc b/inception-doc/src/main/resources/META-INF/asciidoc/user-guide.adoc
@@ -91,7 +91,7 @@ include::{include-dir}annotation_activeLearning.adoc[leveloffset=+2]
 
 include::{include-dir}annotation_concept-linking.adoc[leveloffset=+2]
 
-include::{include-dir}annotation_fact-extraction.adoc[leveloffset=+2]
+// include::{include-dir}annotation_fact-extraction.adoc[leveloffset=+2]
 
 include::{include-dir}annotation_images.adoc[leveloffset=+2]
 
@@ -192,12 +192,54 @@ include::{include-dir}pdf-editor.adoc[leveloffset=+1]
 = Appendices
 
 [appendix]
-include::{include-dir}webannotsv.adoc[leveloffset=+1]
+include::{include-dir}formats.adoc[leveloffset=+1]
+
+include::{include-dir}formats-conll2000.adoc[leveloffset=+2]
+
+include::{include-dir}formats-conll2002.adoc[leveloffset=+2]
+
+include::{include-dir}formats-conll2003.adoc[leveloffset=+2]
+
+include::{include-dir}formats-conll2006.adoc[leveloffset=+2]
+
+include::{include-dir}formats-conll2009.adoc[leveloffset=+2]
+
+include::{include-dir}formats-conll2012.adoc[leveloffset=+2]
+
+include::{include-dir}formats-conllcorenlp.adoc[leveloffset=+2]
+
+include::{include-dir}formats-conllu.adoc[leveloffset=+2]
+
+include::{include-dir}formats-imscwb.adoc[leveloffset=+2]
+
+include::{include-dir}formats-inlinexml.adoc[leveloffset=+2]
+
+include::{include-dir}formats-lif.adoc[leveloffset=+2]
+
+include::{include-dir}formats-nif.adoc[leveloffset=+2]
+
+include::{include-dir}formats-perseus.adoc[leveloffset=+2]
+
+include::{include-dir}formats-tcf.adoc[leveloffset=+2]
+
+include::{include-dir}formats-tei.adoc[leveloffset=+2]
+
+include::{include-dir}formats-text.adoc[leveloffset=+2]
+
+include::{include-dir}formats-uimabinarycas.adoc[leveloffset=+2]
+
+include::{include-dir}formats-uimaxmi.adoc[leveloffset=+2]
+
+include::{include-dir}formats-webannotsv1.adoc[leveloffset=+2]
+
+include::{include-dir}formats-webannotsv2.adoc[leveloffset=+2]
+
+include::{include-dir}formats-webannotsv3.adoc[leveloffset=+2]
 
 <<<
 
 [appendix]
-include::{include-dir}formats.adoc[leveloffset=+1]
+include::{include-dir}webannotsv.adoc[leveloffset=+1]
 
 <<<
 

diff --git a/inception-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-imscwb.adoc b/inception-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-imscwb.adoc
@@ -0,0 +1,76 @@
+// Copyright 2019
+// Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
+// Technische Universität Darmstadt
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+[[sect_formats_imscwb]]
+= IMS CWB (aka VRT)
+
+The "verticalized XML" format used by the link:http://cwb.sourceforge.net[IMS Open Corpus Workbench], 
+a linguistic search engine. It uses a tab-separated format with limited markup (e.g. for sentences, 
+documents, but not recursive structures like parse-trees). In principle, it is a generic format - 
+i.e. there can be arbitrary columns, pseudo-XML elements and attributes. However, support is limited
+to a specific set of columns that must appear exactly in a specific order: *token text*, 
+*part-of-speech tag*, *lemma*. Also only specific pseudo-XML elements and attributes are supported:
+`text` (including an `id` attribute), `s`.
+
+[cols="2,1,1,1,3"]
+|====
+| Format | Read | Write | Custom Layers | Description
+
+| IMS CWB VRT
+| yes
+| no
+| no
+|
+|====
+
+.Example
+[source,text]
+----
+<text id="http://www.epguides.de/nikita.htm">
+<s>
+Nikita	NE	Nikita
+(	$(	(
+La	FM	La
+Femme	NN	Femme
+Nikita	NE	Nikita
+)	$(	)
+Dieser	PDS	dies
+Episodenführer	NN	Episodenführer
+wurde	VAFIN	werden
+von	APPR	von
+September	NN	September
+1998	CARD	1998
+bis	APPR	bis
+Mai	NN	Mai
+1999	CARD	1999
+von	APPR	von
+Konstantin	NE	Konstantin
+C.W.	NE	C.W.
+Volkmann	NE	Volkmann
+geschrieben	VVPP	schreiben
+und	KON	und
+im	APPRART	im
+Mai	NN	Mai
+2000	CARD	2000
+von	APPR	von
+Stefan	NE	Stefan
+Börzel	NN	Börzel
+übernommen	VVPP	übernehmen
+.	$.	.
+</s>
+</text>
+----
+
diff --git a/...tion-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-inlinexml.adoc b/...tion-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-inlinexml.adoc
@@ -0,0 +1,32 @@
+// Copyright 2019
+// Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
+// Technische Universität Darmstadt
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+[[sect_formats_inlinexml]]
+= Inline XML
+
+Tries its best to export the annotations into an inline XML representation. Overlapping annotations are not supported in this format and are silently discarded during export.
+
+[cols="2,1,1,1,3"]
+|====
+| Format | Read | Write | Custom Layers | Description
+
+| link:https://uima.apache.org/d/uimaj-current/apidocs/org/apache/uima/util/CasToInlineXml.html[UIMA Inline XML]
+| no
+| yes
+| yes
+|
+|====
+
diff --git a/inception-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-lif.adoc b/inception-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-lif.adoc
@@ -0,0 +1,52 @@
+// Copyright 2019
+// Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
+// Technische Universität Darmstadt
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+[[sect_formats_lif]]
+= LAPPS Interchange Format
+
+The link:https://wiki.lappsgrid.org/interchange/[LAPPS Interchange Format] (LIF) is a JSON-based format which is used by the link:http://www.lappsgrid.org[Language Applications Grid]. While the format is in principle generic, the support for it is based on the link:http://vocab.lappsgrid.org[LAPPS Web Service Exchange Vocabulary].
+
+[cols="2,1,1,1,3"]
+|====
+| Format | Read | Write | Custom Layers | Description
+
+| link:https://wiki.lappsgrid.org/interchange/[LIF]
+| yes
+| yes
+| no
+| 
+|====
+
+.Example
+[source,text]
+----
+{
+  "id": "v2",
+  "metadata": {
+     "contains": {
+       "Token": {
+         "producer": "org.anc.lapps.stanford.SATokenizer:1.4.0",
+         "type": "tokenization:stanford" },
+       "Token#pos": {
+         "producer": "org.anc.lapps.stanford.SATagger:1.4.0",
+         "posTagSet": "penn",
+         "type": "postagging:stanford" }}},
+  "annotations": [
+     { "@type": "Token", "id": "tok0", "start": 0, "end": 4, "features": { "pos": "NNP" } },
+     { "@type": "Token", "id": "tok1", "start": 5, "end": 10, "features": { "pos": "VBZ" } },
+     { "@type": "Token", "id": "tok2", "start": 10, "end": 11, "features": { "pos": "." } } ]
+}
+----
diff --git a/inception-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-nif.adoc b/inception-doc/src/main/resources/META-INF/asciidoc/user-guide/formats-nif.adoc
@@ -0,0 +1,70 @@
+// Copyright 2019
+// Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
+// Technische Universität Darmstadt
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// 
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+[[sect_formats_nif]]
+= NLP Interchange Format
+
+The link:https://persistence.uni-leipzig.org/nlp2rdf/[NLP Interchange Format] (NIF) provides a way of representing NLP information using semantic web technology, specifically RDF and OWL. A few additions to the format were defined in the apparently unofficial link:https://nif.readthedocs.io/en/latest/[NIF 2.1] specification.
+
+[cols="2,1,1,1,3"]
+|====
+| Format | Read | Write | Custom Layers | Description
+
+| link:https://persistence.uni-leipzig.org/nlp2rdf/[NIF]
+| yes
+| yes
+| no
+| 
+|====
+
+.Example
+[source,text]
+----
+@prefix rdfs:  <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix nif:   <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#> .
+@prefix itsrdf: <http://www.w3.org/2005/11/its/rdf#> .
+@prefix xsd:   <http://www.w3.org/2001/XMLSchema#> .
+@prefix rdf:   <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+
+<http://example.org/document0#char=0,86>
+        a               nif:RFC5147String , nif:String , nif:Context ;
+        nif:beginIndex  "0"^^xsd:nonNegativeInteger ;
+        nif:endIndex    "86"^^xsd:nonNegativeInteger ;
+        nif:isString    "Japan (Japanese: 日本 Nippon or Nihon) is a stratovolcanic archipelago of 6,852 islands."^^xsd:string ;
+        nif:topic       <http://example.org/document0#annotation0> .
+
+<http://example.org/document0#char=0,5>
+        a                     nif:RFC5147String , nif:String ;
+        nif:anchorOf          "Japan"^^xsd:string ;
+        nif:beginIndex        "0"^^xsd:nonNegativeInteger ;
+        nif:endIndex          "5"^^xsd:nonNegativeInteger ;
+        nif:referenceContext  <http://example.org/document0#char=0,86> ;
+        itsrdf:taClassRef     <http://example.org/Country> , <http://example.org/StratovolcanicArchipelago> ;
+        itsrdf:taIdentRef     <http://example.org/Japan> .
+
+<http://example.org/document0#char=42,68>
+        a                     nif:RFC5147String , nif:String ;
+        nif:anchorOf          "stratovolcanic archipelago"^^xsd:string ;
+        nif:beginIndex        "42"^^xsd:nonNegativeInteger ;
+        nif:endIndex          "68"^^xsd:nonNegativeInteger ;
+        nif:referenceContext  <http://example.org/document0#char=0,86> ;
+        itsrdf:taClassRef     <http://example.org/Archipelago> , rdfs:Class ;
+        itsrdf:taIdentRef     <http://example.org/StratovolcanicArchipelago> .
+
+<http://example.org/document0#annotation0>
+        a                  nif:Annotation ;
+        itsrdf:taIdentRef  <http://example.org/Geography> .
+----