From 205402e52057be7f92c2f26c7a9d1a1c2da03476 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Thu, 28 Sep 2023 19:13:19 +0200 Subject: [PATCH] #4209 - Improve query speed for large KBs when using RDF4J Lucene FTS - Set a default fuzzy prefix length of 3 for local KB queries - Enable limiting the results returned internally by local KBs --- .../kb/KnowledgeBaseServiceImpl.java | 50 +++++++++++++++++-- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/inception/inception-kb/src/main/java/de/tudarmstadt/ukp/inception/kb/KnowledgeBaseServiceImpl.java b/inception/inception-kb/src/main/java/de/tudarmstadt/ukp/inception/kb/KnowledgeBaseServiceImpl.java index 230fde33142..355a5e3871e 100644 --- a/inception/inception-kb/src/main/java/de/tudarmstadt/ukp/inception/kb/KnowledgeBaseServiceImpl.java +++ b/inception/inception-kb/src/main/java/de/tudarmstadt/ukp/inception/kb/KnowledgeBaseServiceImpl.java @@ -26,6 +26,8 @@ import static org.apache.commons.lang3.StringUtils.isNotBlank; import static org.apache.commons.lang3.StringUtils.substringAfter; import static org.apache.commons.lang3.StringUtils.substringBefore; +import static org.apache.commons.lang3.reflect.FieldUtils.readField; +import static org.apache.commons.lang3.reflect.FieldUtils.writeField; import static org.eclipse.rdf4j.sparqlbuilder.rdf.Rdf.iri; import java.io.BufferedInputStream; @@ -78,6 +80,7 @@ import org.eclipse.rdf4j.repository.manager.RepositoryManager; import org.eclipse.rdf4j.repository.manager.RepositoryProvider; import org.eclipse.rdf4j.repository.sail.SailRepository; +import org.eclipse.rdf4j.repository.sail.SailRepositoryConnection; import org.eclipse.rdf4j.repository.sail.config.SailRepositoryConfig; import org.eclipse.rdf4j.repository.sparql.SPARQLRepository; import org.eclipse.rdf4j.repository.sparql.config.SPARQLRepositoryConfig; @@ -87,6 +90,8 @@ import org.eclipse.rdf4j.rio.Rio; import org.eclipse.rdf4j.sail.SailException; import 
org.eclipse.rdf4j.sail.lucene.LuceneSail; +import org.eclipse.rdf4j.sail.lucene.LuceneSailConnection; +import org.eclipse.rdf4j.sail.lucene.impl.LuceneIndex; import org.eclipse.rdf4j.sail.lucene.impl.config.LuceneSailConfig; import org.eclipse.rdf4j.sail.nativerdf.config.NativeStoreConfig; import org.eclipse.rdf4j.sparqlbuilder.constraint.propertypath.builder.PropertyPathBuilder; @@ -150,6 +155,8 @@ public class KnowledgeBaseServiceImpl implements KnowledgeBaseService, DisposableBean { + private static final int LOCAL_FUZZY_PREFIX_LENGTH = 3; + private final Logger log = LoggerFactory.getLogger(getClass()); private @PersistenceContext EntityManager entityManager; @@ -299,7 +306,7 @@ public void registerKnowledgeBase(KnowledgeBase aKB, RepositoryImplConfig aCfg) // We want to have a separate Lucene index for every local repo, so we need to hack the // index dir in here because this is the place where we finally know the repo ID. - setIndexDir(aKB, aCfg); + syncIndexParameters(aKB, aCfg); repoManager.addRepositoryConfig(new RepositoryConfig(repositoryId, aCfg)); entityManager.persist(aKB); @@ -492,6 +499,8 @@ public RepositoryConnection getConnection(KnowledgeBase kb) { { skipCertificateChecks(kb.isSkipSslValidation()); + + syncIndexParameters(kb, getDelegate()); } @Override @@ -1366,25 +1375,56 @@ void reconfigureLocalKnowledgeBase(KnowledgeBase aKB) */ RepositoryImplConfig config = getNativeConfig(); - setIndexDir(aKB, config); + syncIndexParameters(aKB, config); repoManager.addRepositoryConfig(new RepositoryConfig(aKB.getRepositoryId(), config)); } - private void setIndexDir(KnowledgeBase aKB, RepositoryImplConfig aCfg) + private void syncIndexParameters(KnowledgeBase aKB, RepositoryImplConfig aCfg) { assertRegistration(aKB); - // We want to have a separate Lucene index for every local repo, so we need to hack the - // index dir in here because this is the place where we finally know the repo ID. 
if (aCfg instanceof SailRepositoryConfig) { SailRepositoryConfig cfg = (SailRepositoryConfig) aCfg; if (cfg.getSailImplConfig() instanceof LuceneSailConfig) { LuceneSailConfig luceneSailCfg = (LuceneSailConfig) cfg.getSailImplConfig(); + + // We want to have a separate Lucene index for every local repo, so we need to hack + // the index dir in here because this is the place where we finally know the repo + // ID. luceneSailCfg.setIndexDir( new File(kbRepositoriesRoot, "indexes/" + aKB.getRepositoryId()) .getAbsolutePath()); + + // Apply the FTS results limit to the KB + luceneSailCfg.setParameter(LuceneSail.MAX_DOCUMENTS_KEY, + Integer.toString(aKB.getMaxResults())); + + // Improve fuzzy search speed + luceneSailCfg.setParameter(LuceneSail.FUZZY_PREFIX_LENGTH_KEY, + Integer.toString(LOCAL_FUZZY_PREFIX_LENGTH)); + } + } + } + + private void syncIndexParameters(KnowledgeBase kb, RepositoryConnection aConn) + { + try { + if (aConn instanceof SailRepositoryConnection) { + var sailRepo = (SailRepositoryConnection) aConn; + var sailConnection = sailRepo.getSailConnection(); + if (sailConnection instanceof LuceneSailConnection) { + var luceneSailConnection = (LuceneSailConnection) sailConnection; + var luceneIndex = (LuceneIndex) readField(luceneSailConnection, "luceneIndex", + true); + writeField(luceneIndex, "maxDocs", kb.getMaxResults(), true); + writeField(luceneIndex, "fuzzyPrefixLength", LOCAL_FUZZY_PREFIX_LENGTH, true); + } } } + catch (Exception e) { + throw new RuntimeException("Unable to sync query parameters into live index - " + + "maybe the RDF4J Lucene index implementation has changed.", e); + } } @Override