Skip to content

Commit

Permalink
Merge branch 'release/33.x'
Browse files Browse the repository at this point in the history
* release/33.x:
  #4860 - Knowledge base items should be matched even if query contains terms out of order
  #4869 - Cannot re-merge curation document
  #4867 - Cannot re-open document for curation if it contains an invalid feature value
  No issue: Warn about slow queries when using a scope
  • Loading branch information
reckart committed Jun 11, 2024
2 parents 5995b0c + 4d8e1d1 commit 8cde109
Show file tree
Hide file tree
Showing 11 changed files with 113 additions and 83 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,6 @@ public class FtsScoreFeatureGenerator
@Override
public void apply(CandidateEntity aCandidate)
{
aCandidate.put(CandidateEntity.KEY_FTS_SCORE, aCandidate.getHandle().getScore());
aCandidate.put(CandidateEntity.SCORE_FTS, aCandidate.getHandle().getScore());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,20 +17,21 @@
*/
package de.tudarmstadt.ukp.inception.conceptlinking.feature;

import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_LEVENSHTEIN_MENTION;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_LEVENSHTEIN_MENTION_CONTEXT;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_LEVENSHTEIN_MENTION_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_LEVENSHTEIN_QUERY;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_LEVENSHTEIN_QUERY_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_MENTION;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_MENTION_CONTEXT;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_MENTION_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_QUERY;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_QUERY_BEST_MATCH_TERM_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_QUERY_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_LEVENSHTEIN_MENTION;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_LEVENSHTEIN_MENTION_CONTEXT;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_LEVENSHTEIN_MENTION_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_LEVENSHTEIN_QUERY;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_LEVENSHTEIN_QUERY_NC;
import static org.apache.commons.lang3.StringUtils.join;

import org.apache.commons.text.similarity.LevenshteinDistance;
import org.springframework.core.annotation.Order;

import de.tudarmstadt.ukp.inception.conceptlinking.config.EntityLinkingServiceAutoConfiguration;
import de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity;
Expand All @@ -41,6 +42,7 @@
* {@link EntityLinkingServiceAutoConfiguration#levenshteinFeatureGenerator()}.
* </p>
*/
@Order(100)
public class LevenshteinFeatureGenerator
implements EntityRankingFeatureGenerator
{
Expand All @@ -60,26 +62,26 @@ private void update(CandidateEntity aCandidate, String aTerm)

aCandidate.get(KEY_MENTION_NC) //
.map(mention -> MEASURE.apply(termNC, mention)) //
.ifPresent(score -> aCandidate.mergeMin(KEY_LEVENSHTEIN_MENTION_NC, score));
.ifPresent(score -> aCandidate.mergeMin(SCORE_LEVENSHTEIN_MENTION_NC, score));

aCandidate.get(KEY_QUERY_NC) //
.map(query -> MEASURE.apply(termNC, query)) //
.ifPresent(score -> {
if (aCandidate.mergeMin(KEY_LEVENSHTEIN_QUERY_NC, score)) {
if (aCandidate.mergeMin(SCORE_LEVENSHTEIN_QUERY_NC, score)) {
aCandidate.put(KEY_QUERY_BEST_MATCH_TERM_NC, aTerm);
}
});

aCandidate.get(KEY_MENTION) //
.map(mention -> MEASURE.apply(aTerm, mention)) //
.ifPresent(score -> aCandidate.mergeMin(KEY_LEVENSHTEIN_MENTION, score));
.ifPresent(score -> aCandidate.mergeMin(SCORE_LEVENSHTEIN_MENTION, score));

aCandidate.get(KEY_QUERY) //
.map(query -> MEASURE.apply(aTerm, query)) //
.ifPresent(score -> aCandidate.mergeMin(KEY_LEVENSHTEIN_QUERY, score));
.ifPresent(score -> aCandidate.mergeMin(SCORE_LEVENSHTEIN_QUERY, score));

aCandidate.get(KEY_MENTION_CONTEXT) //
.map(context -> MEASURE.apply(aTerm, join(context, ' '))) //
.ifPresent(score -> aCandidate.mergeMin(KEY_LEVENSHTEIN_MENTION_CONTEXT, score));
.ifPresent(score -> aCandidate.mergeMin(SCORE_LEVENSHTEIN_MENTION_CONTEXT, score));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,16 @@
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_QUERY_BEST_MATCH_TERM_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_QUERY_BOW;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_QUERY_BOW_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_TOKEN_OVERLAP_MENTION;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_TOKEN_OVERLAP_MENTION_CONTEXT;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_TOKEN_OVERLAP_MENTION_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_TOKEN_OVERLAP_QUERY;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_TOKEN_OVERLAP_QUERY_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_TOKEN_OVERLAP_MENTION;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_TOKEN_OVERLAP_MENTION_CONTEXT;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_TOKEN_OVERLAP_MENTION_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_TOKEN_OVERLAP_QUERY;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_TOKEN_OVERLAP_QUERY_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.sortedBagOfWords;
import static java.util.Arrays.copyOf;

import org.springframework.core.annotation.Order;

import de.tudarmstadt.ukp.inception.conceptlinking.config.EntityLinkingServiceAutoConfiguration;
import de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity;

Expand All @@ -40,15 +42,15 @@
* {@link EntityLinkingServiceAutoConfiguration#matchingTokenOverlapFeatureGenerator}.
* </p>
*/
@Order(200) // Make sure QUERY_BEST_MATCH_TERM from Levenshtein is overwritten
public class MatchingTokenOverlapFeatureGenerator
implements EntityRankingFeatureGenerator
{

@Override
public void apply(CandidateEntity aCandidate)
{
var label = aCandidate.getLabel();
update(aCandidate, label);
update(aCandidate, aCandidate.getLabel());
aCandidate.getHandle().getMatchTerms().forEach(p -> update(aCandidate, p.getKey()));
}

Expand All @@ -60,31 +62,32 @@ private void update(CandidateEntity aCandidate, String aTerm)
aCandidate.get(KEY_MENTION_BOW_NC) //
.map(mention -> distance(tokensNC, mention)) //
.filter(score -> score >= 0) //
.ifPresent(score -> aCandidate.mergeMin(KEY_TOKEN_OVERLAP_MENTION_NC, score));
.ifPresent(score -> aCandidate.mergeMin(SCORE_TOKEN_OVERLAP_MENTION_NC, score));

aCandidate.get(KEY_QUERY_BOW_NC) //
.map(query -> distance(tokensNC, query)) //
.filter(score -> score >= 0) //
.ifPresent(score -> {
if (aCandidate.mergeMin(KEY_TOKEN_OVERLAP_QUERY_NC, score)) {
if (aCandidate.mergeMin(SCORE_TOKEN_OVERLAP_QUERY_NC, score)) {
aCandidate.put(KEY_QUERY_BEST_MATCH_TERM_NC, aTerm);
}
});

aCandidate.get(KEY_MENTION_BOW) //
.map(mention -> distance(tokens, mention)) //
.filter(score -> score >= 0) //
.ifPresent(score -> aCandidate.mergeMin(KEY_TOKEN_OVERLAP_MENTION, score));
.ifPresent(score -> aCandidate.mergeMin(SCORE_TOKEN_OVERLAP_MENTION, score));

aCandidate.get(KEY_QUERY_BOW) //
.map(query -> distance(tokens, query)) //
.filter(score -> score >= 0) //
.ifPresent(score -> aCandidate.mergeMin(KEY_TOKEN_OVERLAP_QUERY, score));
.ifPresent(score -> aCandidate.mergeMin(SCORE_TOKEN_OVERLAP_QUERY, score));

aCandidate.get(KEY_MENTION_CONTEXT) //
.map(context -> distance(tokens, context.toArray(String[]::new))) //
.filter(score -> score >= 0) //
.ifPresent(score -> aCandidate.mergeMin(KEY_TOKEN_OVERLAP_MENTION_CONTEXT, score));
.ifPresent(
score -> aCandidate.mergeMin(SCORE_TOKEN_OVERLAP_MENTION_CONTEXT, score));
}

private int distance(String[] aSortedBowCandidate, String[] aSortedBowUser)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_MENTION_CONTEXT;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_NUM_RELATIONS;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_SIGNATURE_OVERLAP;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_SIGNATURE_OVERLAP_SCORE;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_SIGNATURE_OVERLAP;

import java.io.File;
import java.util.Arrays;
Expand Down Expand Up @@ -138,7 +138,7 @@ public void apply(CandidateEntity aCandidate)
}

aCandidate.put(KEY_SIGNATURE_OVERLAP, signatureOverlap);
aCandidate.put(KEY_SIGNATURE_OVERLAP_SCORE, signatureOverlap.size());
aCandidate.put(SCORE_SIGNATURE_OVERLAP, signatureOverlap.size());
aCandidate.put(KEY_NUM_RELATIONS,
(sig.getRelatedRelations() != null) ? sig.getRelatedRelations().size() : 0);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
*/
public class CandidateEntity
{
public static final Pattern TOKENKIZER_PATTERN = Pattern.compile("\\s+");
public static final Pattern TOKENKIZER_PATTERN = Pattern.compile("[\\s()\\-]+");

public static String[] sortedBagOfWords(String aString)
{
Expand Down Expand Up @@ -94,15 +94,15 @@ public static String[] sortedBagOfWords(String aString)
* the default value to ensure that candidates are ranked last on this feature if it could not
* be calculated.
*/
public static final Key<Integer> KEY_LEVENSHTEIN_MENTION = new Key<>("levMention", MAX_VALUE);
public static final Key<Integer> SCORE_LEVENSHTEIN_MENTION = new Key<>("levMention", MAX_VALUE);

public static final Key<Integer> KEY_LEVENSHTEIN_MENTION_NC = new Key<>("levMentionNc",
public static final Key<Integer> SCORE_LEVENSHTEIN_MENTION_NC = new Key<>("levMentionNc",
MAX_VALUE);

public static final Key<Integer> KEY_TOKEN_OVERLAP_MENTION = new Key<>("tokenOverlapMention",
public static final Key<Integer> SCORE_TOKEN_OVERLAP_MENTION = new Key<>("tokenOverlapMention",
MAX_VALUE);

public static final Key<Integer> KEY_TOKEN_OVERLAP_MENTION_NC = new Key<>(
public static final Key<Integer> SCORE_TOKEN_OVERLAP_MENTION_NC = new Key<>(
"tokenOverlapMentionNc", MAX_VALUE);

/**
Expand All @@ -112,10 +112,10 @@ public static String[] sortedBagOfWords(String aString)
* the default value to ensure that candidates are ranked last on this feature if it could not
* be calculated.
*/
public static final Key<Integer> KEY_LEVENSHTEIN_MENTION_CONTEXT = new Key<>("levContext",
public static final Key<Integer> SCORE_LEVENSHTEIN_MENTION_CONTEXT = new Key<>("levContext",
MAX_VALUE);

public static final Key<Integer> KEY_TOKEN_OVERLAP_MENTION_CONTEXT = new Key<>(
public static final Key<Integer> SCORE_TOKEN_OVERLAP_MENTION_CONTEXT = new Key<>(
"tokenOverlapContext", MAX_VALUE);

/**
Expand All @@ -125,14 +125,15 @@ public static String[] sortedBagOfWords(String aString)
* the default value to ensure that candidates are ranked last on this feature if it could not
* be calculated.
*/
public static final Key<Integer> KEY_LEVENSHTEIN_QUERY = new Key<>("levQuery", MAX_VALUE);
public static final Key<Integer> SCORE_LEVENSHTEIN_QUERY = new Key<>("levQuery", MAX_VALUE);

public static final Key<Integer> KEY_LEVENSHTEIN_QUERY_NC = new Key<>("levQueryNc", MAX_VALUE);
public static final Key<Integer> SCORE_LEVENSHTEIN_QUERY_NC = new Key<>("levQueryNc",
MAX_VALUE);

public static final Key<Integer> KEY_TOKEN_OVERLAP_QUERY = new Key<>("tokenOverlapQuery",
public static final Key<Integer> SCORE_TOKEN_OVERLAP_QUERY = new Key<>("tokenOverlapQuery",
MAX_VALUE);

public static final Key<Integer> KEY_TOKEN_OVERLAP_QUERY_NC = new Key<>("tokenOverlapQueryNc",
public static final Key<Integer> SCORE_TOKEN_OVERLAP_QUERY_NC = new Key<>("tokenOverlapQueryNc",
MAX_VALUE);

/**
Expand All @@ -150,7 +151,7 @@ public static String[] sortedBagOfWords(String aString)
* number of related entities whose entity label occurs in <i>content tokens</i>. <i>Content
* tokens</i> consist of tokens in mention sentence annotated as nouns, verbs or adjectives.
*/
public static final Key<Integer> KEY_SIGNATURE_OVERLAP_SCORE = new Key<>(
public static final Key<Integer> SCORE_SIGNATURE_OVERLAP = new Key<>(
"signatureOverlapScore", 0);

/**
Expand All @@ -161,7 +162,7 @@ public static String[] sortedBagOfWords(String aString)
/**
* FTS score - score assigned by the KB FTS (if any)
*/
public static final Key<Double> KEY_FTS_SCORE = new Key<>("ftsScore", 0.0d);
public static final Key<Double> SCORE_FTS = new Key<>("ftsScore", 0.0d);

/**
* in-link count of wikipedia article of IRI
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,15 @@

import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_FREQUENCY;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_ID_RANK;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_LABEL_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_LEVENSHTEIN_MENTION;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_LEVENSHTEIN_MENTION_CONTEXT;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_LEVENSHTEIN_QUERY;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_LEVENSHTEIN_QUERY_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_NUM_RELATIONS;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_QUERY;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_QUERY_IS_LOWER_CASE;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_QUERY_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_SIGNATURE_OVERLAP_SCORE;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.KEY_TOKEN_OVERLAP_QUERY_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_LEVENSHTEIN_MENTION;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_LEVENSHTEIN_MENTION_CONTEXT;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_LEVENSHTEIN_QUERY;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_LEVENSHTEIN_QUERY_NC;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_SIGNATURE_OVERLAP;
import static de.tudarmstadt.ukp.inception.conceptlinking.model.CandidateEntity.SCORE_TOKEN_OVERLAP_QUERY_NC;

import java.util.Comparator;

Expand All @@ -45,13 +43,9 @@ public class BaselineRankingStrategy
// a 0 while a mismatch is represented using 1. The typical case is that neither
// candidate matches the query which causes the next ranking criteria to be evaluated
.append(queryMatchesIri(e1), queryMatchesIri(e2))
// Use FTS score if available
// .append(e2.get(KEY_FTS_SCORE).get(), e1.get(KEY_FTS_SCORE).get())
// Require token overlap
.append(e1.get(KEY_TOKEN_OVERLAP_QUERY_NC).get(),
e2.get(KEY_TOKEN_OVERLAP_QUERY_NC).get())
// Prefer matches where the query appears in the label
.append(labelMatchesQueryNC(e1), labelMatchesQueryNC(e2))
.append(e1.get(SCORE_TOKEN_OVERLAP_QUERY_NC).get(),
e2.get(SCORE_TOKEN_OVERLAP_QUERY_NC).get())
// Compare geometric mean of the Levenshtein distances to query and mention,
// since both are important and a very close similarity in, say, the mention outweighs
// a not-so-close similarity in the query
Expand All @@ -61,11 +55,10 @@ public class BaselineRankingStrategy
// Cased over caseless
.append(casedOverCaseless(e1), casedOverCaseless(e2))
// A high signature overlap score is preferred.
.append(e2.get(KEY_SIGNATURE_OVERLAP_SCORE).get(),
e1.get(KEY_SIGNATURE_OVERLAP_SCORE).get())
.append(e2.get(SCORE_SIGNATURE_OVERLAP).get(), e1.get(SCORE_SIGNATURE_OVERLAP).get())
// A low edit distance is preferred.
.append(e1.get(KEY_LEVENSHTEIN_MENTION_CONTEXT).get(),
e2.get(KEY_LEVENSHTEIN_MENTION_CONTEXT).get())
.append(e1.get(SCORE_LEVENSHTEIN_MENTION_CONTEXT).get(),
e2.get(SCORE_LEVENSHTEIN_MENTION_CONTEXT).get())
// A high entity frequency is preferred.
.append(e2.get(KEY_FREQUENCY).get(), e1.get(KEY_FREQUENCY).get())
// A high number of related relations is preferred.
Expand All @@ -82,17 +75,10 @@ private static double queryMatchesIri(CandidateEntity aCandidate)
return aCandidate.get(KEY_QUERY).map(q -> q.equals(aCandidate.getIRI()) ? 0 : 1).orElse(1);
}

private static double labelMatchesQueryNC(CandidateEntity aCandidate)
{
return aCandidate.get(KEY_QUERY_NC)
.map(q -> aCandidate.get(KEY_LABEL_NC).map(l -> l.contains(q) ? 0 : 1).orElse(1))
.orElse(1);
}

private static double casedOverCaseless(CandidateEntity aCandidate)
{
int queryNC = aCandidate.get(KEY_LEVENSHTEIN_QUERY_NC).get();
int query = aCandidate.get(KEY_LEVENSHTEIN_QUERY).get();
int queryNC = aCandidate.get(SCORE_LEVENSHTEIN_QUERY_NC).get();
int query = aCandidate.get(SCORE_LEVENSHTEIN_QUERY).get();

return queryNC <= query ? 0 : 1;
}
Expand All @@ -101,8 +87,8 @@ private static double queryOverMention(CandidateEntity aCandidate)
{
boolean caseInsensitive = aCandidate.get(KEY_QUERY_IS_LOWER_CASE).orElse(true);
int query = aCandidate
.get(caseInsensitive ? KEY_LEVENSHTEIN_QUERY_NC : KEY_LEVENSHTEIN_QUERY).get();
int mention = aCandidate.get(KEY_LEVENSHTEIN_MENTION).get();
.get(caseInsensitive ? SCORE_LEVENSHTEIN_QUERY_NC : SCORE_LEVENSHTEIN_QUERY).get();
int mention = aCandidate.get(SCORE_LEVENSHTEIN_MENTION).get();

return query <= mention ? 0 : 1;
}
Expand All @@ -112,8 +98,8 @@ private static double weightedLevenshteinDistance(CandidateEntity aCandidate)
boolean caseInsensitive = aCandidate.get(KEY_QUERY_IS_LOWER_CASE).orElse(true);

int query = aCandidate
.get(caseInsensitive ? KEY_LEVENSHTEIN_QUERY_NC : KEY_LEVENSHTEIN_QUERY).get();
int mention = aCandidate.get(KEY_LEVENSHTEIN_MENTION).get();
.get(caseInsensitive ? SCORE_LEVENSHTEIN_QUERY_NC : SCORE_LEVENSHTEIN_QUERY).get();
int mention = aCandidate.get(SCORE_LEVENSHTEIN_MENTION).get();

if (query == Integer.MAX_VALUE && mention == Integer.MAX_VALUE) {
return Double.MAX_VALUE;
Expand Down
Loading

0 comments on commit 8cde109

Please sign in to comment.