Skip to content

Commit

Permalink
Merge branch '0.14.x'
Browse files Browse the repository at this point in the history
* 0.14.x:
  #1587 - Disable fact linking support
  #1587 - Disable fact linking support
  No issue. Depend on WebAnno 4.0.0 beta 9 for next release.
  #1587 - Disable fact linking support
  #1582 - Support IMS CWB format
  #1582 - Support IMS CWB format
  #1581 - Update and modularize format documentation

% Conflicts:
%	pom.xml
  • Loading branch information
reckart committed Dec 31, 2019
2 parents 830cdc2 + a57fff1 commit 01024d9
Show file tree
Hide file tree
Showing 12 changed files with 441 additions and 108 deletions.
4 changes: 4 additions & 0 deletions inception-app-webapp/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,10 @@
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-io-xml-asl</artifactId>
</dependency>
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-io-imscwb-asl</artifactId>
</dependency>

<!-- WEBANNO DEPENDENCIES -->
<dependency>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Copyright 2019
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.formats;

import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;

import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.core.io.imscwb.ImsCwbReader;
import org.springframework.stereotype.Component;

import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport;

/**
 * {@link FormatSupport} implementation for the IMS Corpus Workbench "verticalized" format (VRT).
 * <p>
 * Only reading is enabled here: documents are read through the DKPro Core {@link ImsCwbReader}.
 * The writer-related overrides are kept below in commented-out form and are therefore inactive.
 */
@Component
public class ImsCwbFormatSupport
    implements FormatSupport
{
    /** Identifier under which this format is registered. */
    public static final String ID = "imscwb";

    /** Human-readable name of this format. */
    public static final String NAME = "Corpus Workbench Format (aka VRT)";

    /**
     * @return the format identifier, i.e. {@link #ID}.
     */
    @Override
    public String getId()
    {
        return ID;
    }

    /**
     * @return the display name of the format, i.e. {@link #NAME}.
     */
    @Override
    public String getName()
    {
        return NAME;
    }

    /**
     * @return always {@code true}; a reader description is provided by
     *         {@link #getReaderDescription()}.
     */
    @Override
    public boolean isReadable()
    {
        return true;
    }

    /**
     * Builds the description of the reader used to import documents in this format.
     *
     * @return a reader description for {@link ImsCwbReader} without any explicit parameters.
     * @throws ResourceInitializationException
     *             if the reader description cannot be created.
     */
    @Override
    public CollectionReaderDescription getReaderDescription() throws ResourceInitializationException
    {
        CollectionReaderDescription readerDescription = createReaderDescription(
                ImsCwbReader.class);
        return readerDescription;
    }

    // Writing is currently disabled; the overrides below are intentionally left inactive.
    //
    // @Override
    // public boolean isWritable()
    // {
    // return true;
    // }
    //
    // @Override
    // public AnalysisEngineDescription getWriterDescription(Project aProject, CAS aCAS)
    // throws ResourceInitializationException
    // {
    // return createEngineDescription(ImsCwbWriter.class);
    // }
}
48 changes: 45 additions & 3 deletions inception-doc/src/main/resources/META-INF/asciidoc/user-guide.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ include::{include-dir}annotation_activeLearning.adoc[leveloffset=+2]

include::{include-dir}annotation_concept-linking.adoc[leveloffset=+2]

include::{include-dir}annotation_fact-extraction.adoc[leveloffset=+2]
// include::{include-dir}annotation_fact-extraction.adoc[leveloffset=+2]

include::{include-dir}annotation_images.adoc[leveloffset=+2]

Expand Down Expand Up @@ -192,12 +192,54 @@ include::{include-dir}pdf-editor.adoc[leveloffset=+1]
= Appendices

[appendix]
include::{include-dir}webannotsv.adoc[leveloffset=+1]
include::{include-dir}formats.adoc[leveloffset=+1]

include::{include-dir}formats-conll2000.adoc[leveloffset=+2]

include::{include-dir}formats-conll2002.adoc[leveloffset=+2]

include::{include-dir}formats-conll2003.adoc[leveloffset=+2]

include::{include-dir}formats-conll2006.adoc[leveloffset=+2]

include::{include-dir}formats-conll2009.adoc[leveloffset=+2]

include::{include-dir}formats-conll2012.adoc[leveloffset=+2]

include::{include-dir}formats-conllcorenlp.adoc[leveloffset=+2]

include::{include-dir}formats-conllu.adoc[leveloffset=+2]

include::{include-dir}formats-imscwb.adoc[leveloffset=+2]

include::{include-dir}formats-inlinexml.adoc[leveloffset=+2]

include::{include-dir}formats-lif.adoc[leveloffset=+2]

include::{include-dir}formats-nif.adoc[leveloffset=+2]

include::{include-dir}formats-perseus.adoc[leveloffset=+2]

include::{include-dir}formats-tcf.adoc[leveloffset=+2]

include::{include-dir}formats-tei.adoc[leveloffset=+2]

include::{include-dir}formats-text.adoc[leveloffset=+2]

include::{include-dir}formats-uimabinarycas.adoc[leveloffset=+2]

include::{include-dir}formats-uimaxmi.adoc[leveloffset=+2]

include::{include-dir}formats-webannotsv1.adoc[leveloffset=+2]

include::{include-dir}formats-webannotsv2.adoc[leveloffset=+2]

include::{include-dir}formats-webannotsv3.adoc[leveloffset=+2]

<<<

[appendix]
include::{include-dir}formats.adoc[leveloffset=+1]
include::{include-dir}webannotsv.adoc[leveloffset=+1]

<<<

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Copyright 2019
// Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
// Technische Universität Darmstadt
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

[[sect_formats_imscwb]]
= IMS CWB (aka VRT)

The "verticalized XML" format used by the link:http://cwb.sourceforge.net[IMS Open Corpus Workbench],
a linguistic search engine. It uses a tab-separated format with limited markup (e.g. for sentences,
documents, but not recursive structures like parse-trees). In principle, it is a generic format -
i.e. there can be arbitrary columns, pseudo-XML elements and attributes. However, support is limited
to a specific set of columns that must appear exactly in a specific order: *token text*,
*part-of-speech tag*, *lemma*. Also only specific pseudo-XML elements and attributes are supported:
`text` (including an `id` attribute), `s`.

[cols="2,1,1,1,3"]
|====
| Format | Read | Write | Custom Layers | Description

| IMS CWB VRT
| yes
| no
| no
|
|====

.Example
[source,text]
----
<text id="http://www.epguides.de/nikita.htm">
<s>
Nikita NE Nikita
( $( (
La FM La
Femme NN Femme
Nikita NE Nikita
) $( )
Dieser PDS dies
Episodenführer NN Episodenführer
wurde VAFIN werden
von APPR von
September NN September
1998 CARD 1998
bis APPR bis
Mai NN Mai
1999 CARD 1999
von APPR von
Konstantin NE Konstantin
C.W. NE C.W.
Volkmann NE Volkmann
geschrieben VVPP schreiben
und KON und
im APPRART im
Mai NN Mai
2000 CARD 2000
von APPR von
Stefan NE Stefan
Börzel NN Börzel
übernommen VVPP übernehmen
. $. .
</s>
</text>
----

Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Copyright 2019
// Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
// Technische Universität Darmstadt
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

[[sect_formats_inlinexml]]
= Inline XML

Tries its best to export the annotations into an inline XML representation. Overlapping annotations are not supported in this format and are silently discarded during export.

[cols="2,1,1,1,3"]
|====
| Format | Read | Write | Custom Layers | Description

| link:https://uima.apache.org/d/uimaj-current/apidocs/org/apache/uima/util/CasToInlineXml.html[UIMA Inline XML]
| no
| yes
| yes
|
|====

Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Copyright 2019
// Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
// Technische Universität Darmstadt
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

[[sect_formats_lif]]
= LAPPS Interchange Format

The link:https://wiki.lappsgrid.org/interchange/[LAPPS Interchange Format] (LIF) is a JSON-based format which is used by the link:http://www.lappsgrid.org[Language Applications Grid]. While the format is in principle generic, the support for it is based on the link:http://vocab.lappsgrid.org[LAPPS Web Service Exchange Vocabulary].

[cols="2,1,1,1,3"]
|====
| Format | Read | Write | Custom Layers | Description

| link:https://wiki.lappsgrid.org/interchange/[LIF]
| yes
| yes
| no
|
|====

.Example
[source,text]
----
{
"id": "v2",
"metadata": {
"contains": {
"Token": {
"producer": "org.anc.lapps.stanford.SATokenizer:1.4.0",
"type": "tokenization:stanford" },
"Token#pos": {
"producer": "org.anc.lapps.stanford.SATagger:1.4.0",
"posTagSet": "penn",
"type": "postagging:stanford" }}},
"annotations": [
{ "@type": "Token", "id": "tok0", "start": 0, "end": 4, "features": { "pos": "NNP" } },
{ "@type": "Token", "id": "tok1", "start": 5, "end": 10, "features": { "pos": "VBZ" } },
{ "@type": "Token", "id": "tok2", "start": 10, "end": 11, "features": { "pos": "." } } ]
}
----
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
// Copyright 2019
// Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
// Technische Universität Darmstadt
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

[[sect_formats_nif]]
= NLP Interchange Format

The link:https://persistence.uni-leipzig.org/nlp2rdf/[NLP Interchange Format] (NIF) provides a way of representing NLP information using semantic web technology, specifically RDF and OWL. A few additions to the format were defined in the apparently unofficial link:https://nif.readthedocs.io/en/latest/[NIF 2.1] specification.

[cols="2,1,1,1,3"]
|====
| Format | Read | Write | Custom Layers | Description

| link:https://persistence.uni-leipzig.org/nlp2rdf/[NIF]
| yes
| yes
| no
|
|====

.Example
[source,text]
----
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#> .
@prefix itsrdf: <http://www.w3.org/2005/11/its/rdf#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
<http://example.org/document0#char=0,86>
a nif:RFC5147String , nif:String , nif:Context ;
nif:beginIndex "0"^^xsd:nonNegativeInteger ;
nif:endIndex "86"^^xsd:nonNegativeInteger ;
nif:isString "Japan (Japanese: 日本 Nippon or Nihon) is a stratovolcanic archipelago of 6,852 islands."^^xsd:string ;
nif:topic <http://example.org/document0#annotation0> .
<http://example.org/document0#char=0,5>
a nif:RFC5147String , nif:String ;
nif:anchorOf "Japan"^^xsd:string ;
nif:beginIndex "0"^^xsd:nonNegativeInteger ;
nif:endIndex "5"^^xsd:nonNegativeInteger ;
nif:referenceContext <http://example.org/document0#char=0,86> ;
itsrdf:taClassRef <http://example.org/Country> , <http://example.org/StratovolcanicArchipelago> ;
itsrdf:taIdentRef <http://example.org/Japan> .
<http://example.org/document0#char=42,68>
a nif:RFC5147String , nif:String ;
nif:anchorOf "stratovolcanic archipelago"^^xsd:string ;
nif:beginIndex "42"^^xsd:nonNegativeInteger ;
nif:endIndex "68"^^xsd:nonNegativeInteger ;
nif:referenceContext <http://example.org/document0#char=0,86> ;
itsrdf:taClassRef <http://example.org/Archipelago> , rdfs:Class ;
itsrdf:taIdentRef <http://example.org/StratovolcanicArchipelago> .
<http://example.org/document0#annotation0>
a nif:Annotation ;
itsrdf:taIdentRef <http://example.org/Geography> .
----
Loading

0 comments on commit 01024d9

Please sign in to comment.