Skip to content

Commit

Permalink
#4209 - Improve query speed for large KBs when using RDF4J Lucene FTS
Browse files Browse the repository at this point in the history
- Factor Lucene FTS query out into a helper class
- Use sub-selects to impose a limit on the FTS matches
  • Loading branch information
reckart committed Sep 27, 2023
1 parent fa31133 commit 4acede1
Show file tree
Hide file tree
Showing 2 changed files with 138 additions and 50 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.inception.kb.querybuilder;

import static org.eclipse.rdf4j.sparqlbuilder.constraint.Expressions.function;
import static org.eclipse.rdf4j.sparqlbuilder.constraint.Expressions.str;
import static org.eclipse.rdf4j.sparqlbuilder.constraint.SparqlFunction.REPLACE;
import static org.eclipse.rdf4j.sparqlbuilder.core.SparqlBuilder.prefix;
import static org.eclipse.rdf4j.sparqlbuilder.core.SparqlBuilder.var;
import static org.eclipse.rdf4j.sparqlbuilder.rdf.Rdf.bNode;
import static org.eclipse.rdf4j.sparqlbuilder.rdf.Rdf.iri;
import static org.eclipse.rdf4j.sparqlbuilder.rdf.Rdf.literalOf;

import org.eclipse.rdf4j.sparqlbuilder.constraint.Expressions;
import org.eclipse.rdf4j.sparqlbuilder.core.Prefix;
import org.eclipse.rdf4j.sparqlbuilder.core.Variable;
import org.eclipse.rdf4j.sparqlbuilder.graphpattern.GraphPattern;
import org.eclipse.rdf4j.sparqlbuilder.graphpattern.GraphPatterns;
import org.eclipse.rdf4j.sparqlbuilder.graphpattern.TriplePattern;
import org.eclipse.rdf4j.sparqlbuilder.rdf.Iri;

import de.tudarmstadt.ukp.inception.kb.querybuilder.backport.Bind;

/**
 * Graph pattern which renders a full-text-search lookup against the RDF4J Lucene SAIL as a
 * sub-select. Wrapping the lookup in a sub-select allows imposing a limit on the FTS matches
 * themselves.
 * <p>
 * The sub-select projects the subject, the match term and the score variables which were passed
 * to the constructor.
 * <p>
 * Instances are configured through the fluent {@link #withLimit(int)} and
 * {@link #alternativeMode()} methods, which mutate and return this object.
 */
public class Rdf4JFtsQuery
    implements GraphPattern
{
    public static final Prefix PREFIX_RDF4J_SEARCH = prefix("search",
            iri("http://www.openrdf.org/contrib/lucenesail#"));
    public static final Iri RDF4J_MATCHES = PREFIX_RDF4J_SEARCH.iri("matches");
    public static final Iri RDF4J_QUERY = PREFIX_RDF4J_SEARCH.iri("query");
    public static final Iri RDF4J_PROPERTY = PREFIX_RDF4J_SEARCH.iri("property");
    public static final Iri RDF4J_SNIPPET = PREFIX_RDF4J_SEARCH.iri("snippet");
    public static final Iri LUCENE_SCORE = PREFIX_RDF4J_SEARCH.iri("score");

    /**
     * The {@code search:property} IRI.
     *
     * @deprecated Use {@link #RDF4J_PROPERTY} which names the IRI it actually refers to. This
     *             alias is only kept so that existing references continue to compile.
     */
    @Deprecated
    public static final Iri RDF4J = RDF4J_PROPERTY;

    private final Variable subject;
    private final Variable score;
    private final Variable matchTerm;
    private final Variable matchTermProperty;
    private final String query;
    private int limit = 0;
    private boolean alternativeMode = false;

    /**
     * @param aSubject
     *            variable bound to the resource matched by the full-text search.
     * @param aScore
     *            variable bound to the score of the match.
     * @param aMatchTerm
     *            variable bound to the value of the matching property on the subject.
     * @param aMatchTermProperty
     *            variable denoting the property in which the match was found.
     * @param aQuery
     *            the query string handed to the full-text index as a literal.
     */
    public Rdf4JFtsQuery(Variable aSubject, Variable aScore, Variable aMatchTerm,
            Variable aMatchTermProperty, String aQuery)
    {
        subject = aSubject;
        score = aScore;
        matchTerm = aMatchTerm;
        matchTermProperty = aMatchTermProperty;
        query = aQuery;
    }

    /**
     * Sets the maximum number of rows the sub-select may return.
     *
     * @param aLimit
     *            the limit. Values less than or equal to zero mean that no {@code LIMIT} clause
     *            is generated.
     * @return this object for chaining.
     */
    public Rdf4JFtsQuery withLimit(int aLimit)
    {
        limit = aLimit;
        return this;
    }

    /**
     * Switches to the snippet-based variant of the query which only keeps those match terms that
     * are equal to the snippet reported by the index (after removing the highlight markers).
     *
     * @return this object for chaining.
     */
    public Rdf4JFtsQuery alternativeMode()
    {
        alternativeMode = true;
        return this;
    }

    /**
     * Renders the sub-select for the configured mode.
     *
     * @return the SPARQL fragment produced by the sub-select.
     */
    @Override
    public String getQueryString()
    {
        GraphPattern pattern = alternativeMode ? snippetFilteredPattern() : plainPattern();

        // NOTE(review): matchTermProperty is used inside the sub-select but not projected, so a
        // binding of the same variable in the enclosing query presumably does not constrain the
        // FTS lookup - confirm whether it should be added to the projection.
        var subSelect = GraphPatterns.select(subject, matchTerm, score).where(pattern);

        // The limit defaults to 0 and callers pass their own "no limit" default of 0 through
        // withLimit(). Emitting "LIMIT 0" would make the sub-select return nothing at all, so a
        // LIMIT clause is only generated for positive values.
        if (limit > 0) {
            subSelect.limit(limit);
        }

        return subSelect.getQueryString();
    }

    /**
     * Builds the plain lookup: match the subject via the index and fetch the value of the
     * property in which the match was found.
     */
    private GraphPattern plainPattern()
    {
        return subject.has(RDF4J_MATCHES, bNode(RDF4J_QUERY, literalOf(query)) //
                .andHas(LUCENE_SCORE, score) //
                .andHas(RDF4J_PROPERTY, matchTermProperty)) //
                .andHas(matchTermProperty, matchTerm);
    }

    /**
     * Builds the snippet-based lookup.
     * <p>
     * If a KB item has multiple labels, we want to return only the ones which actually match the
     * query term such that the user is not confused that the results contain items that don't
     * match the query (even though they do through a label that is not returned). RDF4J only
     * provides access to the matched term in a "highlighted" form where {@code <B>} and
     * {@code </B>} mark the search term. So we have to strip these markers out as part of the
     * query and then compare the result to the match term.
     */
    private GraphPattern snippetFilteredPattern()
    {
        Variable snippet = var("snippet");
        Variable label = var("label");

        return subject //
                .has(RDF4J_MATCHES, bNode(RDF4J_QUERY, literalOf(query)) //
                        .andHas(LUCENE_SCORE, score) //
                        .andHas(RDF4J_PROPERTY, matchTermProperty) //
                        .andHas(RDF4J_SNIPPET, snippet))
                .and(new Bind(
                        function(REPLACE,
                                function(REPLACE, snippet, literalOf("</B>"), literalOf("")),
                                literalOf("<B>"), literalOf("")),
                        label))
                .and(subject.has(matchTermProperty, matchTerm)) //
                // Compare the cleaned snippet with the label value - not with the property IRI
                .filter(Expressions.equals(str(label), str(matchTerm)));
    }

    /**
     * @return always {@code false}.
     */
    @Override
    public boolean isEmpty()
    {
        return false;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -42,12 +42,10 @@
import static org.eclipse.rdf4j.sparqlbuilder.constraint.Expressions.function;
import static org.eclipse.rdf4j.sparqlbuilder.constraint.Expressions.notEquals;
import static org.eclipse.rdf4j.sparqlbuilder.constraint.Expressions.or;
import static org.eclipse.rdf4j.sparqlbuilder.constraint.Expressions.str;
import static org.eclipse.rdf4j.sparqlbuilder.constraint.SparqlFunction.CONTAINS;
import static org.eclipse.rdf4j.sparqlbuilder.constraint.SparqlFunction.LANG;
import static org.eclipse.rdf4j.sparqlbuilder.constraint.SparqlFunction.LANGMATCHES;
import static org.eclipse.rdf4j.sparqlbuilder.constraint.SparqlFunction.REGEX;
import static org.eclipse.rdf4j.sparqlbuilder.constraint.SparqlFunction.REPLACE;
import static org.eclipse.rdf4j.sparqlbuilder.constraint.SparqlFunction.STRSTARTS;
import static org.eclipse.rdf4j.sparqlbuilder.core.SparqlBuilder.dataset;
import static org.eclipse.rdf4j.sparqlbuilder.core.SparqlBuilder.from;
Expand All @@ -63,6 +61,7 @@
import static org.eclipse.rdf4j.sparqlbuilder.rdf.Rdf.literalOf;
import static org.eclipse.rdf4j.sparqlbuilder.rdf.Rdf.literalOfLanguage;

import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
Expand Down Expand Up @@ -115,7 +114,6 @@
import de.tudarmstadt.ukp.inception.kb.graph.KBHandle;
import de.tudarmstadt.ukp.inception.kb.graph.KBObject;
import de.tudarmstadt.ukp.inception.kb.model.KnowledgeBase;
import de.tudarmstadt.ukp.inception.kb.querybuilder.backport.Bind;

/**
* Build queries against the KB.
Expand All @@ -130,7 +128,7 @@
public class SPARQLQueryBuilder
implements SPARQLQuery, SPARQLQueryPrimaryConditions, SPARQLQueryOptionalElements
{
private final static Logger LOG = LoggerFactory.getLogger(SPARQLQueryBuilder.class);
private final static Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

public static final int DEFAULT_LIMIT = 0;

Expand Down Expand Up @@ -162,14 +160,8 @@ public class SPARQLQueryBuilder

public static final Prefix PREFIX_LUCENE_SEARCH = prefix("search",
iri("http://www.openrdf.org/contrib/lucenesail#"));
public static final Iri LUCENE_QUERY = PREFIX_LUCENE_SEARCH.iri("query");
public static final Iri LUCENE_PROPERTY = PREFIX_LUCENE_SEARCH.iri("property");
public static final Iri LUCENE_SCORE = PREFIX_LUCENE_SEARCH.iri("score");
public static final Iri LUCENE_SNIPPET = PREFIX_LUCENE_SEARCH.iri("snippet");

public static final Prefix PREFIX_FUSEKI_SEARCH = prefix("text",
iri("http://jena.apache.org/text#"));
public static final Iri FUSEKI_QUERY = PREFIX_FUSEKI_SEARCH.iri("query");

public static final Prefix PREFIX_STARDOG_SEARCH = prefix("fts", iri(PREFIX_STARDOG));

Expand Down Expand Up @@ -870,12 +862,9 @@ private GraphPattern withLabelMatchingExactlyAnyOf_RDF4J_FTS(String[] aValues)
continue;
}

valuePatterns.add(VAR_SUBJECT
.has(FTS_LUCENE,
bNode(LUCENE_QUERY, literalOf(sanitizedValue)).andHas(LUCENE_PROPERTY,
VAR_MATCH_TERM_PROPERTY))
.andHas(VAR_MATCH_TERM_PROPERTY, VAR_MATCH_TERM)
.filter(equalsPattern(VAR_MATCH_TERM, value, kb)));
valuePatterns.add(new Rdf4JFtsQuery(VAR_SUBJECT, VAR_SCORE, VAR_MATCH_TERM,
VAR_MATCH_TERM_PROPERTY, sanitizedValue).withLimit(getLimit())
.filter(equalsPattern(VAR_MATCH_TERM, value, kb)));
}

return and( //
Expand Down Expand Up @@ -1163,28 +1152,9 @@ private GraphPattern withLabelMatchingAnyOf_RDF4J_FTS(String[] aValues)
continue;
}

var labelFilterExpressions = new ArrayList<Expression<?>>();
labelFilterExpressions.add(Expressions.equals(str(var("label")), str(VAR_MATCH_TERM)));
labelFilterExpressions.add(matchKbLanguage(VAR_MATCH_TERM));

// If a KB item has multiple labels, we want to return only the ones which actually
// match the query term such that the user is not confused that the results contain
// items that don't match the query (even though they do through a label that is not
// returned). RDF4J only provides access to the matched term in a "highlighed" form
// where "<B>" and "</B>" match the search term. So we have to strip these markers
// out as part of the query.
valuePatterns.add(VAR_SUBJECT //
.has(FTS_LUCENE, bNode(LUCENE_QUERY, literalOf(fuzzyQuery)) //
.andHas(LUCENE_PROPERTY, VAR_MATCH_TERM_PROPERTY)
.andHas(LUCENE_SNIPPET, var("snippet")))
.and(new Bind(
function(REPLACE,
function(REPLACE, var("snippet"), literalOf("</B>"),
literalOf("")),
literalOf("<B>"), literalOf("")),
var("label")))
.and(VAR_SUBJECT.has(VAR_MATCH_TERM_PROPERTY, VAR_MATCH_TERM))
.filter(and(labelFilterExpressions.toArray(Expression[]::new))));
valuePatterns.add(new Rdf4JFtsQuery(VAR_SUBJECT, VAR_SCORE, VAR_MATCH_TERM,
VAR_MATCH_TERM_PROPERTY, sanitizedValue).withLimit(getLimit()).alternativeMode()
.filter(matchKbLanguage(VAR_MATCH_TERM)));
}

return and( //
Expand Down Expand Up @@ -1267,12 +1237,9 @@ private GraphPattern withLabelContainingAnyOf_RDF4J_FTS(String[] aValues)
continue;
}

valuePatterns.add(VAR_SUBJECT
.has(FTS_LUCENE,
bNode(LUCENE_QUERY, literalOf(sanitizedValue + "*"))
.andHas(LUCENE_PROPERTY, VAR_MATCH_TERM_PROPERTY))
.andHas(VAR_MATCH_TERM_PROPERTY, VAR_MATCH_TERM)
.filter(containsPattern(VAR_MATCH_TERM, value)));
valuePatterns.add(new Rdf4JFtsQuery(VAR_SUBJECT, VAR_SCORE, VAR_MATCH_TERM,
VAR_MATCH_TERM_PROPERTY, sanitizedValue).withLimit(getLimit())
.filter(containsPattern(VAR_MATCH_TERM, value)));
}

return GraphPatterns.and(bindMatchTermProperties(VAR_MATCH_TERM_PROPERTY),
Expand Down Expand Up @@ -1562,14 +1529,12 @@ private GraphPattern withLabelStartingWith_RDF4J_FTS(String aPrefixQuery)

// Locate all entries where the label contains the prefix (using the FTS) and then
// filter them by those which actually start with the prefix.

return and( //
bindMatchTermProperties(VAR_MATCH_TERM_PROPERTY), //
VAR_SUBJECT
.has(FTS_LUCENE,
bNode(LUCENE_QUERY, literalOf(queryString)).andHas(LUCENE_PROPERTY,
VAR_MATCH_TERM_PROPERTY))
.andHas(VAR_MATCH_TERM_PROPERTY, VAR_MATCH_TERM)
.filter(startsWithPattern(VAR_MATCH_TERM, aPrefixQuery)));
new Rdf4JFtsQuery(VAR_SUBJECT, VAR_SCORE, VAR_MATCH_TERM, VAR_MATCH_TERM_PROPERTY,
sanitizedValue).withLimit(getLimit())
.filter(startsWithPattern(VAR_MATCH_TERM, aPrefixQuery)));
}

private GraphPattern withLabelStartingWith_Fuseki_FTS(String aPrefixQuery)
Expand Down

0 comments on commit 4acede1

Please sign in to comment.