Skip to content

Commit

Permalink
Merge branch 'master' into feature/1239-simple-relation-recommender
Browse files Browse the repository at this point in the history
* master: (29 commits)
  No issue. Fix compilation.
  #1575 - Use fuzzy search for entity linking
  #1575 - Use fuzzy search for entity linking
  [maven-release-plugin] prepare for next development iteration
  [maven-release-plugin] prepare release inception-app-0.14.2
  #1587 - Disable fact linking support
  #1587 - Disable fact linking support
  No issue. Depend on WebAnno 4.0.0 beta 9 for next release.
  #1587 - Disable fact linking support
  #1582 - Support IMS CWB format
  #1575 - Use fuzzy search for entity linking
  #1582 - Support IMS CWB format
  #1581 - Update and modularize format documentation
  No issue. Back to WebAnno SNAPSHOT version.
  [maven-release-plugin] prepare for next development iteration
  [maven-release-plugin] prepare release inception-app-0.14.1
  No issue. Depend on WebAnno 4.0.0-beta-8 for release.
  No issue. Use placeholder mechanism of SLF4J instead of String.format.
  No issue. Adjust to upstream API change.
  #1549 - Filter entity candidates by description
  ...

% Conflicts:
%	inception-recommendation-api/src/main/java/de/tudarmstadt/ukp/inception/recommendation/api/RecommendationService.java
%	inception-recommendation/src/main/java/de/tudarmstadt/ukp/inception/recommendation/service/RecommendationServiceImpl.java
  • Loading branch information
reckart committed Jan 5, 2020
2 parents 9e6af83 + 686059e commit e8c7936
Show file tree
Hide file tree
Showing 44 changed files with 1,581 additions and 398 deletions.
4 changes: 4 additions & 0 deletions inception-app-webapp/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,10 @@
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-io-xml-asl</artifactId>
</dependency>
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-io-imscwb-asl</artifactId>
</dependency>

<!-- WEBANNO DEPENDENCIES -->
<dependency>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Copyright 2019
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.formats;

import static org.apache.uima.fit.factory.CollectionReaderFactory.createReaderDescription;

import org.apache.uima.collection.CollectionReaderDescription;
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.core.io.imscwb.ImsCwbReader;
import org.springframework.stereotype.Component;

import de.tudarmstadt.ukp.clarin.webanno.api.format.FormatSupport;

/**
 * {@link FormatSupport} registration for the IMS Corpus Workbench format (also known as VRT).
 * <p>
 * Only reading is wired up in this class: it reports itself as readable and supplies a reader
 * description backed by {@link ImsCwbReader}. No writer is provided here; enabling export would
 * mean overriding {@code isWritable()} and {@code getWriterDescription(Project, CAS)} (e.g. with
 * an engine description for {@code ImsCwbWriter}).
 */
@Component
public class ImsCwbFormatSupport
    implements FormatSupport
{
    /** Identifier returned by {@link #getId()}. */
    public static final String ID = "imscwb";

    /** Display name returned by {@link #getName()}. */
    public static final String NAME = "Corpus Workbench Format (aka VRT)";

    /**
     * @return the format identifier, i.e. {@link #ID}.
     */
    @Override
    public String getId()
    {
        return ID;
    }

    /**
     * @return the format display name, i.e. {@link #NAME}.
     */
    @Override
    public String getName()
    {
        return NAME;
    }

    /**
     * @return always {@code true}; a reader description is available via
     *         {@link #getReaderDescription()}.
     */
    @Override
    public boolean isReadable()
    {
        return true;
    }

    /**
     * Builds the description of the collection reader used to read this format.
     *
     * @return a reader description for {@link ImsCwbReader} created without any extra
     *         configuration parameters.
     * @throws ResourceInitializationException
     *             if the reader description cannot be created.
     */
    @Override
    public CollectionReaderDescription getReaderDescription()
        throws ResourceInitializationException
    {
        CollectionReaderDescription readerDescription = createReaderDescription(
                ImsCwbReader.class);
        return readerDescription;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
import java.util.stream.Collectors;

import org.apache.commons.lang3.ClassUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.Validate;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.text.AnnotationFS;
Expand Down Expand Up @@ -187,8 +188,8 @@ public Set<KBHandle> generateCandidates(KnowledgeBase aKB, String aConceptScope,
iriMatches = kbService.read(aKB, conn -> iriMatchBuilder.asHandles(conn, true));
}

log.debug("Found [{}] candidates exactly matching IRI {}",
iriMatches.size(), asList(aQuery));
log.debug("Found [{}] candidates exactly matching IRI [{}]", iriMatches.size(),
aQuery);

result.addAll(iriMatches);
}
Expand All @@ -205,32 +206,38 @@ public Set<KBHandle> generateCandidates(KnowledgeBase aKB, String aConceptScope,
// set of containing matches, due to the ranking performed by the KB/FTS, we might
// not actually see the exact matches within the first N results. So we query for
// the exact matches separately to ensure we have them.
String[] exactLabels = asList(
(aQuery != null && aQuery.length() <= threshold) ? aQuery : null, aMention)
.stream()
.filter(Objects::nonNull)
// Mind, we use the query and the mention text here - of course we don't only want
// exact matches of the query but also of the mention :)
String[] exactLabels = asList(aQuery, aMention).stream()
.filter(StringUtils::isNotBlank)
.toArray(String[]::new);
exactBuilder.withLabelMatchingExactlyAnyOf(exactLabels);

exactBuilder
.retrieveLabel()
.retrieveDescription();

List<KBHandle> exactMatches;
if (aKB.isReadOnly()) {
exactMatches = kbService.listHandlesCaching(aKB, exactBuilder, true);
}
else {
exactMatches = kbService.read(aKB, conn -> exactBuilder.asHandles(conn, true));
if (exactLabels.length > 0) {
exactBuilder.withLabelMatchingExactlyAnyOf(exactLabels);

exactBuilder
.retrieveLabel()
.retrieveDescription();

List<KBHandle> exactMatches;
if (aKB.isReadOnly()) {
exactMatches = kbService.listHandlesCaching(aKB, exactBuilder, true);
}
else {
exactMatches = kbService.read(aKB, conn -> exactBuilder.asHandles(conn, true));
}


log.debug("Found [{}] candidates exactly matching {}",
exactMatches.size(), asList(exactLabels));

result.addAll(exactMatches);
}


log.debug("Found [{}] candidates exactly matching {}",
exactMatches.size(), asList(exactLabels));

result.addAll(exactMatches);

if (aQuery != null && aQuery.length() > threshold) {
// Next we also do a "starting with" search - but only if the user's query is at least as long
// as the threshold - this is because for short queries, we'd get way too many results which
// would be slow - and also the results would likely not be very accurate
if (aQuery != null && aQuery.trim().length() >= threshold) {
SPARQLQueryPrimaryConditions startingWithBuilder = newQueryBuilder(aValueType, aKB);

if (aConceptScope != null) {
Expand Down Expand Up @@ -261,39 +268,44 @@ public Set<KBHandle> generateCandidates(KnowledgeBase aKB, String aConceptScope,
result.addAll(startingWithMatches);
}


// Collect containing matches
SPARQLQueryPrimaryConditions containingBuilder = newQueryBuilder(aValueType, aKB);

if (aConceptScope != null) {
// Scope-limiting must always happen before label matching!
containingBuilder.descendantsOf(aConceptScope);
}

String[] containingLabels = asList(
(aQuery != null && aQuery.length() > threshold) ? aQuery : null, aMention)
.stream()
// Finally, we use the query and mention also for a "containing" search - but only if they
// are at least as long as the threshold. Again, for very short query/mention, we'd otherwise
// get way too many matches, being slow and not accurate.
String[] longLabels = asList(aQuery, aMention).stream()
.filter(Objects::nonNull)
.map(s -> s.trim())
.filter(s -> s.length() >= threshold)
.toArray(String[]::new);
containingBuilder.withLabelContainingAnyOf(containingLabels);

containingBuilder
.retrieveLabel()
.retrieveDescription();

List<KBHandle> containingMatches;
if (aKB.isReadOnly()) {
containingMatches = kbService.listHandlesCaching(aKB, containingBuilder, true);
}
else {
containingMatches = kbService.read(aKB,
conn -> containingBuilder.asHandles(conn, true));
if (longLabels.length > 0) {
// Collect containing matches
SPARQLQueryPrimaryConditions containingBuilder = newQueryBuilder(aValueType, aKB);

if (aConceptScope != null) {
// Scope-limiting must always happen before label matching!
containingBuilder.descendantsOf(aConceptScope);
}

containingBuilder.withLabelMatchingAnyOf(longLabels);

containingBuilder
.retrieveLabel()
.retrieveDescription();

List<KBHandle> containingMatches;
if (aKB.isReadOnly()) {
containingMatches = kbService.listHandlesCaching(aKB, containingBuilder, true);
}
else {
containingMatches = kbService.read(aKB,
conn -> containingBuilder.asHandles(conn, true));
}

log.debug("Found [{}] candidates using matching {}", containingMatches.size(),
asList(longLabels));

result.addAll(containingMatches);
}

log.debug("Found [{}] candidates using containing {}",
containingMatches.size(), asList(containingLabels));

result.addAll(containingMatches);

log.debug("Generated [{}] candidates in {}ms", result.size(),
currentTimeMillis() - startTime);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

package de.tudarmstadt.ukp.inception.conceptlinking.service;

import static de.tudarmstadt.ukp.inception.kb.ConceptFeatureValueType.ANY_OBJECT;
import static java.util.Collections.emptyList;
import static org.assertj.core.api.Assertions.assertThat;

Expand All @@ -42,7 +43,6 @@
import de.tudarmstadt.ukp.clarin.webanno.model.Project;
import de.tudarmstadt.ukp.inception.conceptlinking.config.EntityLinkingProperties;
import de.tudarmstadt.ukp.inception.conceptlinking.util.TestFixtures;
import de.tudarmstadt.ukp.inception.kb.ConceptFeatureValueType;
import de.tudarmstadt.ukp.inception.kb.KnowledgeBaseService;
import de.tudarmstadt.ukp.inception.kb.KnowledgeBaseServiceImpl;
import de.tudarmstadt.ukp.inception.kb.config.KnowledgeBaseProperties;
Expand Down Expand Up @@ -95,8 +95,7 @@ public void thatLuceneSailIndexedConceptIsRetrievableWithFullTextSearch() throws
kbService.registerKnowledgeBase(kb, kbService.getNativeConfig());
importKnowledgeBase("data/pets.ttl");

List<KBHandle> handles = sut.disambiguate(kb, null,
ConceptFeatureValueType.ANY_OBJECT, null, "soc", 0, null);
List<KBHandle> handles = sut.disambiguate(kb, null, ANY_OBJECT, "soc", null, 0, null);

assertThat(handles.stream().map(KBHandle::getName))
.as("Check whether \"Socke\" has been retrieved.")
Expand All @@ -112,8 +111,7 @@ public void thatAddedLuceneSailIndexedConceptIsRetrievableWithFullTextSearch() t
KBConcept concept = new KBConcept();
concept.setName("manatee");
kbService.createConcept(kb, concept);
List<KBHandle> handles = sut.disambiguate(kb, null,
ConceptFeatureValueType.ANY_OBJECT, null, "man", 0, null);
List<KBHandle> handles = sut.disambiguate(kb, null, ANY_OBJECT, "man", null, 0, null);

assertThat(handles.stream().map(KBHandle::getName))
.as("Check whether \"manatee\" has been retrieved.")
Expand Down
48 changes: 45 additions & 3 deletions inception-doc/src/main/resources/META-INF/asciidoc/user-guide.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ include::{include-dir}annotation_activeLearning.adoc[leveloffset=+2]

include::{include-dir}annotation_concept-linking.adoc[leveloffset=+2]

include::{include-dir}annotation_fact-extraction.adoc[leveloffset=+2]
// include::{include-dir}annotation_fact-extraction.adoc[leveloffset=+2]

include::{include-dir}annotation_images.adoc[leveloffset=+2]

Expand Down Expand Up @@ -192,12 +192,54 @@ include::{include-dir}pdf-editor.adoc[leveloffset=+1]
= Appendices

[appendix]
include::{include-dir}webannotsv.adoc[leveloffset=+1]
include::{include-dir}formats.adoc[leveloffset=+1]

include::{include-dir}formats-conll2000.adoc[leveloffset=+2]

include::{include-dir}formats-conll2002.adoc[leveloffset=+2]

include::{include-dir}formats-conll2003.adoc[leveloffset=+2]

include::{include-dir}formats-conll2006.adoc[leveloffset=+2]

include::{include-dir}formats-conll2009.adoc[leveloffset=+2]

include::{include-dir}formats-conll2012.adoc[leveloffset=+2]

include::{include-dir}formats-conllcorenlp.adoc[leveloffset=+2]

include::{include-dir}formats-conllu.adoc[leveloffset=+2]

include::{include-dir}formats-imscwb.adoc[leveloffset=+2]

include::{include-dir}formats-inlinexml.adoc[leveloffset=+2]

include::{include-dir}formats-lif.adoc[leveloffset=+2]

include::{include-dir}formats-nif.adoc[leveloffset=+2]

include::{include-dir}formats-perseus.adoc[leveloffset=+2]

include::{include-dir}formats-tcf.adoc[leveloffset=+2]

include::{include-dir}formats-tei.adoc[leveloffset=+2]

include::{include-dir}formats-text.adoc[leveloffset=+2]

include::{include-dir}formats-uimabinarycas.adoc[leveloffset=+2]

include::{include-dir}formats-uimaxmi.adoc[leveloffset=+2]

include::{include-dir}formats-webannotsv1.adoc[leveloffset=+2]

include::{include-dir}formats-webannotsv2.adoc[leveloffset=+2]

include::{include-dir}formats-webannotsv3.adoc[leveloffset=+2]

<<<

[appendix]
include::{include-dir}formats.adoc[leveloffset=+1]
include::{include-dir}webannotsv.adoc[leveloffset=+1]

<<<

Expand Down
Loading

0 comments on commit e8c7936

Please sign in to comment.