Skip to content

Commit

Permalink
#4209 - Improve query speed for large KBs when using RDF4J Lucene FTS
Browse files Browse the repository at this point in the history
- Set a default fuzzy query length of 3 for local KB queries
- Enable limiting the results returned internally by local KBs
  • Loading branch information
reckart committed Sep 28, 2023
1 parent fa31133 commit 205402e
Showing 1 changed file with 45 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
import static org.apache.commons.lang3.StringUtils.isNotBlank;
import static org.apache.commons.lang3.StringUtils.substringAfter;
import static org.apache.commons.lang3.StringUtils.substringBefore;
import static org.apache.commons.lang3.reflect.FieldUtils.readField;
import static org.apache.commons.lang3.reflect.FieldUtils.writeField;
import static org.eclipse.rdf4j.sparqlbuilder.rdf.Rdf.iri;

import java.io.BufferedInputStream;
Expand Down Expand Up @@ -78,6 +80,7 @@
import org.eclipse.rdf4j.repository.manager.RepositoryManager;
import org.eclipse.rdf4j.repository.manager.RepositoryProvider;
import org.eclipse.rdf4j.repository.sail.SailRepository;
import org.eclipse.rdf4j.repository.sail.SailRepositoryConnection;
import org.eclipse.rdf4j.repository.sail.config.SailRepositoryConfig;
import org.eclipse.rdf4j.repository.sparql.SPARQLRepository;
import org.eclipse.rdf4j.repository.sparql.config.SPARQLRepositoryConfig;
Expand All @@ -87,6 +90,8 @@
import org.eclipse.rdf4j.rio.Rio;
import org.eclipse.rdf4j.sail.SailException;
import org.eclipse.rdf4j.sail.lucene.LuceneSail;
import org.eclipse.rdf4j.sail.lucene.LuceneSailConnection;
import org.eclipse.rdf4j.sail.lucene.impl.LuceneIndex;
import org.eclipse.rdf4j.sail.lucene.impl.config.LuceneSailConfig;
import org.eclipse.rdf4j.sail.nativerdf.config.NativeStoreConfig;
import org.eclipse.rdf4j.sparqlbuilder.constraint.propertypath.builder.PropertyPathBuilder;
Expand Down Expand Up @@ -150,6 +155,8 @@
public class KnowledgeBaseServiceImpl
implements KnowledgeBaseService, DisposableBean
{
private static final int LOCAL_FUZZY_PREFIX_LENGTH = 3;

private final Logger log = LoggerFactory.getLogger(getClass());

private @PersistenceContext EntityManager entityManager;
Expand Down Expand Up @@ -299,7 +306,7 @@ public void registerKnowledgeBase(KnowledgeBase aKB, RepositoryImplConfig aCfg)

// We want to have a separate Lucene index for every local repo, so we need to hack the
// index dir in here because this is the place where we finally know the repo ID.
setIndexDir(aKB, aCfg);
syncIndexParameters(aKB, aCfg);

repoManager.addRepositoryConfig(new RepositoryConfig(repositoryId, aCfg));
entityManager.persist(aKB);
Expand Down Expand Up @@ -492,6 +499,8 @@ public RepositoryConnection getConnection(KnowledgeBase kb)
{
{
skipCertificateChecks(kb.isSkipSslValidation());

syncIndexParameters(kb, getDelegate());
}

@Override
Expand Down Expand Up @@ -1366,25 +1375,56 @@ void reconfigureLocalKnowledgeBase(KnowledgeBase aKB)
*/

RepositoryImplConfig config = getNativeConfig();
setIndexDir(aKB, config);
syncIndexParameters(aKB, config);
repoManager.addRepositoryConfig(new RepositoryConfig(aKB.getRepositoryId(), config));
}

private void setIndexDir(KnowledgeBase aKB, RepositoryImplConfig aCfg)
/**
 * Writes the KB-specific Lucene settings into the given repository configuration.
 * <p>
 * Nothing is changed unless {@code aCfg} is a {@link SailRepositoryConfig} whose sail
 * implementation config is a {@link LuceneSailConfig}. In that case the index directory, the
 * maximum number of documents and the fuzzy prefix length are set on the Lucene sail config.
 *
 * @param aKB
 *            the knowledge base supplying the repository ID and the result limit; it is passed
 *            to {@code assertRegistration} before anything else happens.
 * @param aCfg
 *            the repository implementation configuration to update in place.
 */
private void syncIndexParameters(KnowledgeBase aKB, RepositoryImplConfig aCfg)
{
    assertRegistration(aKB);

    if (!(aCfg instanceof SailRepositoryConfig)) {
        return;
    }

    var sailImplCfg = ((SailRepositoryConfig) aCfg).getSailImplConfig();
    if (!(sailImplCfg instanceof LuceneSailConfig)) {
        return;
    }

    var luceneCfg = (LuceneSailConfig) sailImplCfg;

    // Every local repository gets its own Lucene index. The directory can only be derived
    // here because this is the point at which the repository ID is finally known.
    var indexDir = new File(kbRepositoriesRoot, "indexes/" + aKB.getRepositoryId());
    luceneCfg.setIndexDir(indexDir.getAbsolutePath());

    // Cap the number of FTS results using the limit configured on the KB
    var maxDocuments = Integer.toString(aKB.getMaxResults());
    luceneCfg.setParameter(LuceneSail.MAX_DOCUMENTS_KEY, maxDocuments);

    // A fixed prefix length is meant to speed up fuzzy searches
    var fuzzyPrefixLength = Integer.toString(LOCAL_FUZZY_PREFIX_LENGTH);
    luceneCfg.setParameter(LuceneSail.FUZZY_PREFIX_LENGTH_KEY, fuzzyPrefixLength);
}

/**
 * Pushes the KB's result limit and the local fuzzy prefix length into the Lucene index that
 * sits behind an open connection.
 * <p>
 * Connections that are not a {@link SailRepositoryConnection} wrapping a
 * {@link LuceneSailConnection} are left untouched. The index object and its {@code maxDocs} and
 * {@code fuzzyPrefixLength} fields are accessed reflectively with forced access, so this
 * depends on the field names used by the RDF4J Lucene implementation.
 *
 * @param kb
 *            the knowledge base whose maximum result count is applied.
 * @param aConn
 *            the connection whose underlying index should be updated.
 * @throws RuntimeException
 *             if anything goes wrong while reading or writing the index fields.
 */
private void syncIndexParameters(KnowledgeBase kb, RepositoryConnection aConn)
{
    try {
        if (!(aConn instanceof SailRepositoryConnection)) {
            return;
        }

        var sailConn = ((SailRepositoryConnection) aConn).getSailConnection();
        if (!(sailConn instanceof LuceneSailConnection)) {
            return;
        }

        // Reach into the connection to obtain the live index instance
        var index = (LuceneIndex) readField((LuceneSailConnection) sailConn, "luceneIndex",
                true);

        // Overwrite the tuning fields directly on the live index
        writeField(index, "maxDocs", kb.getMaxResults(), true);
        writeField(index, "fuzzyPrefixLength", LOCAL_FUZZY_PREFIX_LENGTH, true);
    }
    catch (Exception e) {
        var message = "Unable to sync query parameters into live index - "
                + "maybe the RDF4J Lucene index implementation has changed.";
        throw new RuntimeException(message, e);
    }
}

@Override
Expand Down

0 comments on commit 205402e

Please sign in to comment.