From 8f9b13a4066f7cf4fb4856e45d15ef8d442a174d Mon Sep 17 00:00:00 2001 From: NickG-1 <116678373+NickG-1@users.noreply.github.com> Date: Thu, 18 Jan 2024 15:17:06 +0100 Subject: [PATCH] Completely refactor the fulltext operations (#1093) As of this commit, the fulltext index (triggered by `ql:contains-word` and `ql:contains-entity`) uses two basic operations: 1. `TextIndexScanForWord`: For a given word or prefix, return all text records that contain the word (possibly together with the matched word in the case of a prefix, and the score of the match). 2. `TextIndexScanForEntity`: For a given word or prefix, return a superset of all pairs of `(text, entity)` where the entity is contained in the text according to `ql:contains-entity` and the text contains the `word`. For technical reasons this is a superset: We always have to scan the complete block from the half-inverted index which might belong to a shorter prefix. The general processing is then as follows: * For each word or prefix that appears as part of the object of a `ql:contains-word` triple, a `TextIndexScanForWord` is created. * For each entity or variable that appears as the object of a `ql:contains-entity` triple, a `TextIndexScanForEntity` is created. * The rest of the query processing is handled by the "ordinary" query planner using the normal operations like JOIN that are also used to process standard SPARQL queries. This is much cleaner than the old `TextOperationWith[out]Filter` operations, which combined the functionality of the above scan operations with JOIN operations, because the old approach led to a lot of code duplication (the code for a join of two tables was duplicated for the fulltext module) and because the new approach makes queries easier to optimize and to reason about: the runtime information trees become much clearer if the scans and joins are represented separately. 
--- e2e/scientists_queries.yaml | 106 ++-- src/engine/CMakeLists.txt | 2 +- src/engine/QueryExecutionTree.cpp | 10 + src/engine/QueryExecutionTree.h | 2 + src/engine/QueryPlanner.cpp | 508 ++++++++-------- src/engine/QueryPlanner.h | 92 +-- src/engine/TextIndexScanForEntity.cpp | 110 ++++ src/engine/TextIndexScanForEntity.h | 111 ++++ src/engine/TextIndexScanForWord.cpp | 82 +++ src/engine/TextIndexScanForWord.h | 60 ++ src/global/Constants.h | 2 + src/index/FTSAlgorithms.cpp | 42 +- src/index/FTSAlgorithms.h | 7 +- src/index/Index.cpp | 29 +- src/index/Index.h | 15 +- src/index/IndexImpl.Text.cpp | 134 ++++- src/index/IndexImpl.h | 50 +- src/index/Vocabulary.cpp | 15 +- src/index/Vocabulary.h | 6 +- src/parser/data/Variable.h | 10 + .../data/VariableToColumnMapPrinters.cpp | 26 +- .../sparqlParser/SparqlQleverVisitor.cpp | 57 +- test/FTSAlgorithmsTest.cpp | 8 +- test/IndexTestHelpers.h | 6 +- test/QueryPlannerTest.cpp | 553 +++++------------- test/QueryPlannerTestHelpers.h | 37 ++ test/SparqlParserTest.cpp | 8 +- test/VocabularyTest.cpp | 32 +- test/engine/CMakeLists.txt | 2 + test/engine/TextIndexScanForEntityTest.cpp | 155 +++++ test/engine/TextIndexScanForWordTest.cpp | 127 ++++ test/engine/TextIndexScanTestHelpers.h | 43 ++ test/util/IndexTestHelpers.cpp | 40 +- 33 files changed, 1624 insertions(+), 863 deletions(-) create mode 100644 src/engine/TextIndexScanForEntity.cpp create mode 100644 src/engine/TextIndexScanForEntity.h create mode 100644 src/engine/TextIndexScanForWord.cpp create mode 100644 src/engine/TextIndexScanForWord.h create mode 100644 test/engine/TextIndexScanForEntityTest.cpp create mode 100644 test/engine/TextIndexScanForWordTest.cpp create mode 100644 test/engine/TextIndexScanTestHelpers.h diff --git a/e2e/scientists_queries.yaml b/e2e/scientists_queries.yaml index 0b6dc1418c..29ee2481fc 100644 --- a/e2e/scientists_queries.yaml +++ b/e2e/scientists_queries.yaml @@ -5,16 +5,16 @@ queries: - query: relativ-star-scientists type: text sparql: | 
- SELECT ?x ?t ?ql_textscore_t WHERE { + SELECT ?x ?t ?ql_score_t_var_x WHERE { ?x . ?t ql:contains-entity ?x . ?t ql:contains-word "relati*" } - ORDER BY DESC(?ql_textscore_t) + ORDER BY DESC(?ql_score_t_var_x) checks: - num_cols: 3 - num_rows: 4285 - - selected: [ "?x", "?t", "?ql_textscore_t"] + - selected: [ "?x", "?t", "?ql_score_t_var_x"] - contains_row: - "" - "He realized, however, that the principle of relativity could also be extended @@ -23,30 +23,27 @@ queries: - null - contains_row: [ "", null, null ] # null cells are ignored - contains_row: [ "", null, null ] # Test Unicode - - order_numeric: {"dir" : "DESC", "var": "?ql_textscore_t"} + - order_numeric: {"dir" : "DESC", "var": "?ql_score_t_var_x"} - - query: relativ-star-scientists-from-ulm # should use TextOperationWithFilter + - query: relativ-star-scientists-from-ulm type: text sparql: | - SELECT ?x ?t ?ql_textscore_t WHERE { + SELECT ?x ?t WHERE { ?x . ?x . ?t ql:contains-entity ?x . ?t ql:contains-word "relati*" } - ORDER BY DESC(?ql_textscore_t) - TEXTLIMIT 1 checks: - - num_cols: 3 - - num_rows: 1 - - selected: [ "?x", "?t", "?ql_textscore_t" ] + - num_cols: 2 + - num_rows: 172 + - selected: [ "?x", "?t"] - contains_row: - "" - "He realized, however, that the principle of relativity could also be extended to gravitational fields, and with his subsequent theory of gravitation in 1916, he published a paper on general relativity." - - null - query: relat-star-Physikalische-real-star-scientists-from-ulm type: text @@ -55,11 +52,11 @@ queries: ?x . ?x . ?t ql:contains-entity ?x . 
- ?t ql:contains-word "relat* Physikalische rela*" + ?t ql:contains-word "RElaT* phySIKalische rela*" } checks: - num_cols: 5 - - selected: [ "?x", "?ql_textscore_t", "?t", "?ql_matchingword_t_relat", "?ql_matchingword_t_rela" ] + - selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_matchingword_t_relat", "?ql_matchingword_t_rela" ] - contains_row: - "" - null @@ -88,26 +85,26 @@ queries: - query: algo-star-female-scientists type: text sparql: | - SELECT ?x ?ql_textscore_t WHERE { + SELECT ?x ?ql_score_t_var_x WHERE { ?x . ?x . ?t ql:contains-entity ?x . ?t ql:contains-word "algo*" } - ORDER BY DESC(?ql_textscore_t) + ORDER BY DESC(?ql_score_t_var_x) checks: - num_cols: 2 - num_rows: 27 - - selected: [ "?x", "?ql_textscore_t" ] + - selected: [ "?x", "?ql_score_t_var_x" ] - contains_row: [ "", null ] - - order_numeric: {"dir": "DESC", "var" : "?ql_textscore_t"} + - order_numeric: {"dir": "DESC", "var" : "?ql_score_t_var_x"} - - query: algor-start-female-born-before-1940 + - query: algor-star-female-born-before-1940 type: text sparql: | PREFIX xsd: - SELECT ?x ?date ?t ?ql_textscore_t ?ql_matchingword_t_algor WHERE { + SELECT ?x ?date ?t ?ql_matchingword_t_algor WHERE { ?x . ?x ?date . ?x . @@ -115,29 +112,75 @@ queries: ?t ql:contains-word "algor*" . FILTER (?date < "1940-01-01"^^xsd:date) } - ORDER BY DESC(?ql_textscore_t) checks: - - num_cols: 5 + - num_cols: 4 - num_rows: 4 - contains_row: - "" - "1901-03-02" - "Hermann's algorithm for primary decomposition is still in use now." - - null - "algorithm" - contains_row: - "" - "1815-12-10" - "Her notes on the engine include what is recognised as the first algorithm intended to be carried out by a machine." - - null - "algorithm" - - order_numeric: {"dir": "DESC", "var" : "?ql_textscore_t"} - - query: algorithm-hermann-start-female-born-before-1940 + - query: algor-star-female-fixedEntity-ada-ordered + type: text + sparql: | + SELECT * WHERE { + ?scientist . + ?scientist . + ?text ql:contains-entity ?scientist . 
+ ?text ql:contains-entity . + ?text ql:contains-word "rela*" . + } + ORDER BY DESC(?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_) + checks: + - num_cols: 5 + - num_rows: 7 + - contains_row: + - "" + - null + - "As a teenager, her mathematical talents led her to an ongoing + working relationship and friendship with fellow British mathematician + Charles Babbage, also known as' the father of computers', and in + particular, Babbage's work on the Analytical Engine." + - null + - "relationship" + - order_numeric: {"dir": "DESC", + "var" : "?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_"} + + - query: algor-star-female-fixedEntity-ada-fixed-Entity-mary + type: text + sparql: | + SELECT * WHERE { + ?scientist . + ?scientist . + ?text ql:contains-entity ?scientist . + ?text ql:contains-entity . + ?text ql:contains-entity . + ?text ql:contains-word "rela*" . + } + checks: + - num_cols: 6 + - num_rows: 2 + - contains_row: + - "" + - null + - "She became fascinated with the machine and used her relationship + with Somerville to visit Babbage as often as she could." + - null + - null + - "relationship" + + + - query: algorithm-hermann-star-female-born-before-1940 type: text sparql: | PREFIX xsd: - SELECT ?x ?date ?t ?ql_textscore_t WHERE { + SELECT ?x ?date ?t WHERE { ?x . ?x ?date . ?x . @@ -145,16 +188,13 @@ queries: ?t ql:contains-word "algorithm hermann" . FILTER (?date < "1940-01-01"^^xsd:date) } - ORDER BY DESC(?ql_textscore_t) checks: - - num_cols: 4 + - num_cols: 3 - num_rows: 1 - contains_row: - "" - "1901-03-02" - "Hermann's algorithm for primary decomposition is still in use now." - - null - - order_numeric: {"dir": "DESC", "var" : "?ql_textscore_t"} - query: people-born-in-1901 type: no-text @@ -1239,11 +1279,11 @@ queries: ?x . ?t ql:contains-entity ?x . 
?t ql:contains-word "algo* herm* primary" - } TEXTLIMIT 1 + } checks: - num_cols: 5 - num_rows: 1 - - selected: [ "?x", "?ql_textscore_t", "?t", "?ql_matchingword_t_algo", "?ql_matchingword_t_herm" ] + - selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_matchingword_t_algo", "?ql_matchingword_t_herm" ] - contains_row: [ "",null,"Hermann's algorithm for primary decomposition is still in use now.","algorithm","hermann" ] diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index 8e37590511..b68c739d24 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -10,6 +10,6 @@ add_library(engine Union.cpp MultiColumnJoin.cpp TransitivePath.cpp Service.cpp Values.cpp Bind.cpp Minus.cpp RuntimeInformation.cpp CheckUsePatternTrick.cpp VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp - CartesianProductJoin.cpp + CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp idTable/CompressedExternalIdTable.h) qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams) diff --git a/src/engine/QueryExecutionTree.cpp b/src/engine/QueryExecutionTree.cpp index b7a13a6af3..b372a76bbf 100644 --- a/src/engine/QueryExecutionTree.cpp +++ b/src/engine/QueryExecutionTree.cpp @@ -28,6 +28,8 @@ #include "engine/OrderBy.h" #include "engine/Service.h" #include "engine/Sort.h" +#include "engine/TextIndexScanForEntity.h" +#include "engine/TextIndexScanForWord.h" #include "engine/TextOperationWithFilter.h" #include "engine/TextOperationWithoutFilter.h" #include "engine/TransitivePath.h" @@ -176,6 +178,10 @@ void QueryExecutionTree::setOperation(std::shared_ptr operation) { type_ = TEXT_WITH_FILTER; } else if constexpr (std::is_same_v) { type_ = TEXT_WITHOUT_FILTER; + } else if constexpr (std::is_same_v) { + type_ = TEXT_INDEX_SCAN_FOR_WORD; + } else if constexpr (std::is_same_v) { + type_ = TEXT_INDEX_SCAN_FOR_ENTITY; } else if constexpr (std::is_same_v) { type_ = 
COUNT_AVAILABLE_PREDICATES; } else if constexpr (std::is_same_v) { @@ -217,6 +223,10 @@ template void QueryExecutionTree::setOperation( std::shared_ptr); template void QueryExecutionTree::setOperation( std::shared_ptr); +template void QueryExecutionTree::setOperation( + std::shared_ptr); +template void QueryExecutionTree::setOperation( + std::shared_ptr); template void QueryExecutionTree::setOperation( std::shared_ptr); template void QueryExecutionTree::setOperation(std::shared_ptr); diff --git a/src/engine/QueryExecutionTree.h b/src/engine/QueryExecutionTree.h index 8a533ce91c..f612cbf32b 100644 --- a/src/engine/QueryExecutionTree.h +++ b/src/engine/QueryExecutionTree.h @@ -45,6 +45,8 @@ class QueryExecutionTree { DISTINCT, TEXT_WITHOUT_FILTER, TEXT_WITH_FILTER, + TEXT_INDEX_SCAN_FOR_WORD, + TEXT_INDEX_SCAN_FOR_ENTITY, OPTIONAL_JOIN, COUNT_AVAILABLE_PREDICATES, GROUP_BY, diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index 4fb1b66c5c..20d2267329 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -4,35 +4,38 @@ // 2015-2017 Björn Buchhold (buchhold@informatik.uni-freiburg.de) // 2018- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de) -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "engine/QueryPlanner.h" #include #include +#include "engine/Bind.h" +#include "engine/CartesianProductJoin.h" +#include "engine/CheckUsePatternTrick.h" +#include "engine/CountAvailablePredicates.h" +#include "engine/Distinct.h" +#include "engine/Filter.h" +#include "engine/GroupBy.h" +#include "engine/HasPredicateScan.h" +#include "engine/IndexScan.h" +#include "engine/Join.h" +#include "engine/Minus.h" +#include "engine/MultiColumnJoin.h" +#include "engine/NeutralElementOperation.h" +#include "engine/OptionalJoin.h" 
+#include "engine/OrderBy.h" +#include "engine/Service.h" +#include "engine/Sort.h" +#include "engine/TextIndexScanForEntity.h" +#include "engine/TextIndexScanForWord.h" +#include "engine/TextOperationWithFilter.h" +#include "engine/TextOperationWithoutFilter.h" +#include "engine/TransitivePath.h" +#include "engine/Union.h" +#include "engine/Values.h" +#include "parser/Alias.h" +#include "parser/SparqlParserHelpers.h" + namespace p = parsedQuery; namespace { @@ -188,9 +191,6 @@ std::vector QueryPlanner::optimize( auto optimizeCommutativ = [this](const auto& triples, const auto& plans, const auto& filters) { auto tg = createTripleGraph(&triples); - LOG(TRACE) << "Collapse text cliques..." << std::endl; - tg.collapseTextCliques(); - LOG(TRACE) << "Collapse text cliques done." << std::endl; // always apply all filters to be safe. // TODO it could be possible, to allow the DpTab to leave // results unfiltered and add the filters later, but this has to be @@ -464,9 +464,6 @@ std::vector QueryPlanner::optimize( // joinCandidates lambda; if (candidatePlans.size() > 1 || !candidateTriples._triples.empty()) { auto tg = createTripleGraph(&candidateTriples); - LOG(TRACE) << "Collapse text cliques..." << std::endl; - tg.collapseTextCliques(); - LOG(TRACE) << "Collapse text cliques done." << std::endl; auto lastRow = fillDpTab(tg, rootPattern->_filters, candidatePlans).back(); candidateTriples._triples.clear(); candidatePlans.clear(); @@ -646,36 +643,184 @@ vector QueryPlanner::getOrderByRow( return added; } +void QueryPlanner::addNodeToTripleGraph(const TripleGraph::Node& node, + QueryPlanner::TripleGraph& tg) const { + // TODO This needs quite some refactoring: The IDs of the nodes have + // to be ascending as an invariant, so we can store all the nodes in a + // vector or even a plain vector. 
+ tg._nodeStorage.emplace_back(node); + auto& addedNode = tg._nodeStorage.back(); + tg._nodeMap[addedNode.id_] = &addedNode; + tg._adjLists.emplace_back(); + AD_CORRECTNESS_CHECK(tg._adjLists.size() == tg._nodeStorage.size()); + AD_CORRECTNESS_CHECK(tg._adjLists.size() == addedNode.id_ + 1); + // Now add an edge between the added node and every node sharing a var. + for (auto& addedNodevar : addedNode._variables) { + for (size_t i = 0; i < addedNode.id_; ++i) { + auto& otherNode = *tg._nodeMap[i]; + if (otherNode._variables.contains(addedNodevar)) { + // There is an edge between *it->second and the node with id "id". + tg._adjLists[addedNode.id_].push_back(otherNode.id_); + tg._adjLists[otherNode.id_].push_back(addedNode.id_); + } + } + } +} + // _____________________________________________________________________________ QueryPlanner::TripleGraph QueryPlanner::createTripleGraph( const p::BasicGraphPattern* pattern) const { TripleGraph tg; - if (pattern->_triples.size() > 64) { - AD_THROW("At most 64 triples allowed at the moment."); - } + size_t numNodesInTripleGraph = 0; + ad_utility::HashMap optTermForCvar; + ad_utility::HashMap> potentialTermsForCvar; + vector entityTriples; + // Add one or more nodes for each triple. for (auto& t : pattern->_triples) { - // Add a node for the triple. - tg._nodeStorage.emplace_back(TripleGraph::Node(tg._nodeStorage.size(), t)); - auto& addedNode = tg._nodeStorage.back(); - tg._nodeMap[addedNode._id] = &tg._nodeStorage.back(); - tg._adjLists.emplace_back(vector()); - assert(tg._adjLists.size() == tg._nodeStorage.size()); - assert(tg._adjLists.size() == addedNode._id + 1); - // Now add an edge between the added node and every node sharing a var. - for (auto& addedNodevar : addedNode._variables) { - for (size_t i = 0; i < addedNode._id; ++i) { - auto& otherNode = *tg._nodeMap[i]; - if (otherNode._variables.count(addedNodevar) > 0) { - // There is an edge between *it->second and the node with id "id". 
- tg._adjLists[addedNode._id].push_back(otherNode._id); - tg._adjLists[otherNode._id].push_back(addedNode._id); - } + if (t._p._iri == CONTAINS_WORD_PREDICATE) { + std::string buffer = t._o.toString(); + std::string_view sv{buffer}; + // Add one node for each word + for (const auto& term : + absl::StrSplit(sv.substr(1, sv.size() - 2), ' ')) { + std::string s{ad_utility::utf8ToLower(term)}; + potentialTermsForCvar[t._s.getVariable()].push_back(s); + addNodeToTripleGraph( + TripleGraph::Node(tg._nodeStorage.size(), t._s.getVariable(), s, t), + tg); + numNodesInTripleGraph++; } + } else if (t._p._iri == CONTAINS_ENTITY_PREDICATE) { + entityTriples.push_back(&t); + } else { + addNodeToTripleGraph(TripleGraph::Node(tg._nodeStorage.size(), t), tg); + numNodesInTripleGraph++; } } + for (const auto& [cvar, terms] : potentialTermsForCvar) { + optTermForCvar[cvar] = + terms[_qec->getIndex().getIndexOfBestSuitedElTerm(terms)]; + } + for (const SparqlTriple* t : entityTriples) { + Variable currentVar = t->_s.getVariable(); + if (!optTermForCvar.contains(currentVar)) { + AD_THROW( + "Missing ql:contains-word statement. A ql:contains-entity " + "statement always also needs corresponding ql:contains-word " + "statement."); + } + addNodeToTripleGraph(TripleGraph::Node(tg._nodeStorage.size(), currentVar, + optTermForCvar[currentVar], *t), + tg); + numNodesInTripleGraph++; + } + if (numNodesInTripleGraph > 64) { + AD_THROW("At most 64 triples allowed at the moment."); + } return tg; } +// _____________________________________________________________________________ +template +void QueryPlanner::indexScanSingleVarCase( + const TripleGraph::Node& node, const PushPlanFunction& pushPlan, + const AddedIndexScanFunction& addIndexScan) { + using enum Permutation::Enum; + + // TODO: The case where the same variable appears in subject + predicate or + // object + predicate is missing here and leads to an assertion failure. 
+ if (isVariable(node.triple_._s) && isVariable(node.triple_._o) && + node.triple_._s == node.triple_._o) { + if (isVariable(node.triple_._p._iri)) { + AD_THROW("Triple with one variable repeated three times"); + } + LOG(DEBUG) << "Subject variable same as object variable" << std::endl; + // Need to handle this as IndexScan with a new unique + // variable + Filter. Works in both directions + Variable filterVar = generateUniqueVarName(); + auto scanTriple = node.triple_; + scanTriple._o = filterVar; + auto scanTree = makeExecutionTree(_qec, PSO, scanTriple); + // The simplest way to set up the filtering expression is to use the + // parser. + std::string filterString = + absl::StrCat("FILTER (", scanTriple._s.getVariable().name(), "=", + filterVar.name(), ")"); + auto filter = sparqlParserHelpers::ParserAndVisitor{filterString} + .parseTypesafe(&SparqlAutomaticParser::filterR) + .resultOfParse_; + auto plan = + makeSubtreePlan(_qec, scanTree, std::move(filter.expression_)); + pushPlan(std::move(plan)); + } else if (isVariable(node.triple_._s)) { + addIndexScan(POS); + } else if (isVariable(node.triple_._o)) { + addIndexScan(PSO); + } else { + AD_CONTRACT_CHECK(isVariable(node.triple_._p)); + addIndexScan(SOP); + } +} + +// _____________________________________________________________________________ +template +void QueryPlanner::indexScanTwoVarsCase( + const TripleGraph::Node& node, + const AddedIndexScanFunction& addIndexScan) const { + using enum Permutation::Enum; + + // TODO: The case that the same variable appears in more than one position + // leads (as in indexScanSingleVarCase) to an assertion. 
+ if (!isVariable(node.triple_._p._iri)) { + addIndexScan(PSO); + addIndexScan(POS); + } else if (!isVariable(node.triple_._s)) { + addIndexScan(SPO); + addIndexScan(SOP); + } else if (!isVariable(node.triple_._o)) { + addIndexScan(OSP); + addIndexScan(OPS); + } +} + +// _____________________________________________________________________________ +template +void QueryPlanner::indexScanThreeVarsCase( + const TripleGraph::Node& node, + const AddedIndexScanFunction& addIndexScan) const { + using enum Permutation::Enum; + + if (!_qec || _qec->getIndex().hasAllPermutations()) { + // Add plans for all six permutations. + addIndexScan(OPS); + addIndexScan(OSP); + addIndexScan(PSO); + addIndexScan(POS); + addIndexScan(SPO); + addIndexScan(SOP); + } else { + AD_THROW( + "With only 2 permutations registered (no -a option), " + "triples should have at most two variables. " + "Not the case in: " + + node.triple_.asString()); + } +} + +// _____________________________________________________________________________ +template +void QueryPlanner::seedFromOrdinaryTriple( + const TripleGraph::Node& node, const PushPlanFunction& pushPlan, + const AddedIndexScanFunction& addIndexScan) { + if (node._variables.size() == 1) { + indexScanSingleVarCase(node, pushPlan, addIndexScan); + } else if (node._variables.size() == 2) { + indexScanTwoVarsCase(node, addIndexScan); + } else { + indexScanThreeVarsCase(node, addIndexScan); + } +} + // _____________________________________________________________________________ vector QueryPlanner::seedWithScansAndText( const QueryPlanner::TripleGraph& tg, @@ -696,30 +841,30 @@ vector QueryPlanner::seedWithScansAndText( for (size_t i = 0; i < tg._nodeMap.size(); ++i) { const TripleGraph::Node& node = *tg._nodeMap.find(i)->second; - auto pushPlan = [&](SubtreePlan plan) { + auto pushPlan = [&seeds, i](SubtreePlan plan) { plan._idsOfIncludedNodes = (uint64_t(1) << i); seeds.push_back(std::move(plan)); }; - auto addIndexScan = [&](Permutation::Enum 
permutation) { - pushPlan(makeSubtreePlan(_qec, permutation, node._triple)); + auto addIndexScan = [this, pushPlan, node](Permutation::Enum permutation) { + pushPlan(makeSubtreePlan(_qec, permutation, node.triple_)); }; using enum Permutation::Enum; - if (node._cvar.has_value()) { + if (node.isTextNode()) { seeds.push_back(getTextLeafPlan(node)); continue; } if (node._variables.empty()) { AD_THROW("Triples should have at least one variable. Not the case in: " + - node._triple.asString()); + node.triple_.asString()); } // If the predicate is a property path, we have to recursively set up the // index scans. - if (node._triple._p._operation != PropertyPath::Operation::IRI) { - for (SubtreePlan& plan : seedFromPropertyPathTriple(node._triple)) { + if (node.triple_._p._operation != PropertyPath::Operation::IRI) { + for (SubtreePlan& plan : seedFromPropertyPathTriple(node.triple_)) { pushPlan(std::move(plan)); } continue; @@ -728,7 +873,7 @@ vector QueryPlanner::seedWithScansAndText( // At this point, we know that the predicate is a simple IRI or a variable. if (_qec && !_qec->getIndex().hasAllPermutations() && - isVariable(node._triple._p._iri)) { + isVariable(node.triple_._p._iri)) { AD_THROW( "The query contains a predicate variable, but only the PSO " "and POS permutations were loaded. Rerun the server without " @@ -736,74 +881,12 @@ vector QueryPlanner::seedWithScansAndText( "necessary also rebuild the index."); } - if (node._triple._p._iri == HAS_PREDICATE_PREDICATE) { - pushPlan(makeSubtreePlan(_qec, node._triple)); + if (node.triple_._p._iri == HAS_PREDICATE_PREDICATE) { + pushPlan(makeSubtreePlan(_qec, node.triple_)); continue; } - if (node._variables.size() == 1) { - // There is exactly one variable in the triple (may occur twice). 
- if (isVariable(node._triple._s) && isVariable(node._triple._o) && - node._triple._s == node._triple._o) { - if (isVariable(node._triple._p._iri)) { - AD_THROW("Triple with one variable repeated three times"); - } - LOG(DEBUG) << "Subject variable same as object variable" << std::endl; - // Need to handle this as IndexScan with a new unique - // variable + Filter. Works in both directions - Variable filterVar = generateUniqueVarName(); - auto scanTriple = node._triple; - scanTriple._o = filterVar; - auto scanTree = makeExecutionTree(_qec, PSO, scanTriple); - // The simplest way to set up the filtering expression is to use the - // parser. - std::string filterString = - absl::StrCat("FILTER (", scanTriple._s.getVariable().name(), "=", - filterVar.name(), ")"); - auto filter = sparqlParserHelpers::ParserAndVisitor{filterString} - .parseTypesafe(&SparqlAutomaticParser::filterR) - .resultOfParse_; - auto plan = makeSubtreePlan(_qec, scanTree, - std::move(filter.expression_)); - pushPlan(std::move(plan)); - } else if (isVariable(node._triple._s)) { - addIndexScan(POS); - } else if (isVariable(node._triple._o)) { - addIndexScan(PSO); - } else { - AD_CONTRACT_CHECK(isVariable(node._triple._p)); - addIndexScan(SOP); - } - } else if (node._variables.size() == 2) { - // Add plans for both possible scan directions. - if (!isVariable(node._triple._p._iri)) { - addIndexScan(PSO); - addIndexScan(POS); - } else if (!isVariable(node._triple._s)) { - addIndexScan(SPO); - addIndexScan(SOP); - } else if (!isVariable(node._triple._o)) { - addIndexScan(OSP); - addIndexScan(OPS); - } - } else { - // The current triple contains three distinct variables. - if (!_qec || _qec->getIndex().hasAllPermutations()) { - // Add plans for all six permutations. 
- addIndexScan(OPS); - addIndexScan(OSP); - addIndexScan(PSO); - addIndexScan(POS); - addIndexScan(SPO); - addIndexScan(SOP); - } else { - AD_THROW( - "With only 2 permutations registered (no -a option), " - "triples should have at most two variables. " - "Not the case in: " + - node._triple.asString()); - } - } + seedFromOrdinaryTriple(node, pushPlan, addIndexScan); } return seeds; } @@ -973,11 +1056,29 @@ Variable QueryPlanner::generateUniqueVarName() { // _____________________________________________________________________________ QueryPlanner::SubtreePlan QueryPlanner::getTextLeafPlan( const QueryPlanner::TripleGraph::Node& node) const { + AD_CONTRACT_CHECK(node.wordPart_.has_value()); + string word = node.wordPart_.value(); SubtreePlan plan(_qec); - plan._idsOfIncludedNodes |= (size_t(1) << node._id); - AD_CONTRACT_CHECK(node._wordPart.has_value()); - plan._qet = makeExecutionTree( - _qec, node._wordPart.value(), node._variables, node._cvar.value()); + if (node.triple_._p._iri == CONTAINS_ENTITY_PREDICATE) { + if (node._variables.size() == 2) { + // TODO: This is not nice, refactor the whole TripleGraph class + // to make these checks more explicity. + Variable evar = *(node._variables.begin()) == node.cvar_.value() + ? 
*(++node._variables.begin()) + : *(node._variables.begin()); + plan = makeSubtreePlan(_qec, node.cvar_.value(), + evar, word); + } else { + // Fixed entity case + AD_CORRECTNESS_CHECK(node._variables.size() == 1); + plan = makeSubtreePlan( + _qec, node.cvar_.value(), node.triple_._o.toString(), word); + } + } else { + plan = + makeSubtreePlan(_qec, node.cvar_.value(), word); + } + plan._idsOfIncludedNodes |= (size_t(1) << node.id_); return plan; } @@ -1043,13 +1144,12 @@ vector QueryPlanner::merge( string QueryPlanner::TripleGraph::asString() const { std::ostringstream os; for (size_t i = 0; i < _adjLists.size(); ++i) { - if (!_nodeMap.find(i)->second->_cvar.has_value()) { - os << i << " " << _nodeMap.find(i)->second->_triple.asString() << " : ("; + if (!_nodeMap.find(i)->second->cvar_.has_value()) { + os << i << " " << _nodeMap.find(i)->second->triple_.asString() << " : ("; } else { os << i << " {TextOP for " - << _nodeMap.find(i)->second->_cvar.value().name() << ", wordPart: \"" - << absl::StrJoin(_nodeMap.find(i)->second->_wordPart.value(), " ") - << "\"} : ("; + << _nodeMap.find(i)->second->cvar_.value().name() << ", wordPart: \"" + << _nodeMap.find(i)->second->wordPart_.value() << "\"} : ("; } for (size_t j = 0; j < _adjLists[i].size(); ++j) { @@ -1286,24 +1386,9 @@ vector> QueryPlanner::fillDpTab( // _____________________________________________________________________________ bool QueryPlanner::TripleGraph::isTextNode(size_t i) const { return _nodeMap.count(i) > 0 && - (_nodeMap.find(i)->second->_triple._p._iri == + (_nodeMap.find(i)->second->triple_._p._iri == CONTAINS_ENTITY_PREDICATE || - _nodeMap.find(i)->second->_triple._p._iri == CONTAINS_WORD_PREDICATE); -} - -// _____________________________________________________________________________ -ad_utility::HashMap> -QueryPlanner::TripleGraph::identifyTextCliques() const { - ad_utility::HashMap> contextVarToTextNodesIds; - // Fill contextVar -> triples map - for (size_t i = 0; i < _adjLists.size(); ++i) { 
- if (isTextNode(i)) { - auto& triple = _nodeMap.find(i)->second->_triple; - auto& cvar = triple._s; - contextVarToTextNodesIds[cvar.getVariable()].push_back(i); - } - } - return contextVarToTextNodesIds; + _nodeMap.find(i)->second->triple_._p._iri == CONTAINS_WORD_PREDICATE); } // _____________________________________________________________________________ @@ -1432,7 +1517,7 @@ QueryPlanner::TripleGraph::TripleGraph( const std::vector>>& init) { for (const std::pair>& p : init) { _nodeStorage.push_back(p.first); - _nodeMap[p.first._id] = &_nodeStorage.back(); + _nodeMap[p.first.id_] = &_nodeStorage.back(); _adjLists.push_back(p.second); } } @@ -1451,7 +1536,7 @@ QueryPlanner::TripleGraph::TripleGraph(const QueryPlanner::TripleGraph& other, if (keep.count(i) > 0) { _nodeStorage.push_back(*other._nodeMap.find(i)->second); idChange[i] = _nodeMap.size(); - _nodeStorage.back()._id = _nodeMap.size(); + _nodeStorage.back().id_ = _nodeMap.size(); _nodeMap[idChange[i]] = &_nodeStorage.back(); } } @@ -1493,123 +1578,6 @@ QueryPlanner::TripleGraph& QueryPlanner::TripleGraph::operator=( QueryPlanner::TripleGraph::TripleGraph() : _adjLists(), _nodeMap(), _nodeStorage() {} -// ___________________________________________________________________________ -namespace { - -// Remove the quotation marks around an enquoted literal and convert it to lower -// case. This is only used in the `collapseTextCliques` function. -string stripAndLowercaseLiteral(std::string_view lit) { - AD_CORRECTNESS_CHECK(lit.size() >= 2 && lit.starts_with('"') && - lit.ends_with('"')); - lit.remove_prefix(1); - lit.remove_suffix(1); - return ad_utility::utf8ToLower(lit); -} -} // namespace - -// _____________________________________________________________________________ -void QueryPlanner::TripleGraph::collapseTextCliques() { - // TODO: Could use more refactoring. - - // Create a map from context var to triples it occurs in (the cliques). 
- ad_utility::HashMap> cvarsToTextNodes( - identifyTextCliques()); - if (cvarsToTextNodes.empty()) { - return; - } - // Now turn each such clique into a new node the represents that whole - // text operation clique. - size_t id = 0; - vector textNodes; - ad_utility::HashMap removedNodeIds; - vector> tnAdjSetsToOldIds; - for (auto& cvarsToTextNode : cvarsToTextNodes) { - auto& cvar = cvarsToTextNode.first; - std::vector words; - vector trips; - tnAdjSetsToOldIds.emplace_back(); - auto& adjNodes = tnAdjSetsToOldIds.back(); - for (auto nid : cvarsToTextNode.second) { - removedNodeIds[nid] = id; - adjNodes.insert(_adjLists[nid].begin(), _adjLists[nid].end()); - auto& triple = _nodeMap[nid]->_triple; - trips.push_back(triple); - // TODO I think the check "is the predicate ql:contains_word" is - // missing. Verify this. - if (triple._s == cvar && triple._o.isLiteral()) { - std::vector newWords = absl::StrSplit( - stripAndLowercaseLiteral( - triple._o.getLiteral().normalizedLiteralContent().get()), - ' '); - words.insert(words.end(), newWords.begin(), newWords.end()); - } - } - textNodes.emplace_back(id, cvar, std::move(words), trips); - ++id; - assert(tnAdjSetsToOldIds.size() == id); - } - - // Finally update the graph (node ids and adj lists). - vector> oldAdjLists = _adjLists; - std::list oldNodeStorage = _nodeStorage; - _nodeStorage.clear(); - _nodeMap.clear(); - _adjLists.clear(); - ad_utility::HashMap idMapOldToNew; - ad_utility::HashMap idMapNewToOld; - - // Storage and ids. - for (auto& tn : textNodes) { - _nodeStorage.push_back(tn); - _nodeMap[tn._id] = &_nodeStorage.back(); - } - - for (auto& n : oldNodeStorage) { - if (removedNodeIds.count(n._id) == 0) { - idMapOldToNew[n._id] = id; - idMapNewToOld[id] = n._id; - n._id = id++; - _nodeStorage.push_back(n); - _nodeMap[n._id] = &_nodeStorage.back(); - } - } - - // Adj lists - // First for newly created text nodes. 
- for (size_t i = 0; i < tnAdjSetsToOldIds.size(); ++i) { - const auto& nodes = tnAdjSetsToOldIds[i]; - std::set adjNodes; - for (auto nid : nodes) { - if (removedNodeIds.count(nid) == 0) { - adjNodes.insert(idMapOldToNew[nid]); - } else if (removedNodeIds[nid] != i) { - adjNodes.insert(removedNodeIds[nid]); - } - } - vector adjList; - adjList.insert(adjList.begin(), adjNodes.begin(), adjNodes.end()); - _adjLists.emplace_back(adjList); - } - assert(_adjLists.size() == textNodes.size()); - assert(_adjLists.size() == tnAdjSetsToOldIds.size()); - // Then for remaining (regular) nodes. - for (size_t i = textNodes.size(); i < _nodeMap.size(); ++i) { - const Node& node = *_nodeMap[i]; - const auto& oldAdjList = oldAdjLists[idMapNewToOld[node._id]]; - std::set adjNodes; - for (auto nid : oldAdjList) { - if (removedNodeIds.count(nid) == 0) { - adjNodes.insert(idMapOldToNew[nid]); - } else { - adjNodes.insert(removedNodeIds[nid]); - } - } - vector adjList; - adjList.insert(adjList.begin(), adjNodes.begin(), adjNodes.end()); - _adjLists.emplace_back(adjList); - } -} - // _____________________________________________________________________________ bool QueryPlanner::TripleGraph::isSimilar( const QueryPlanner::TripleGraph& other) const { @@ -1629,8 +1597,8 @@ bool QueryPlanner::TripleGraph::isSimilar( bool hasMatch = false; for (const Node& n2 : other._nodeStorage) { if (n.isSimilar(n2)) { - id_map[n._id] = n2._id; - id_map_reverse[n2._id] = n._id; + id_map[n.id_] = n2.id_; + id_map_reverse[n2.id_] = n.id_; hasMatch = true; break; } else { diff --git a/src/engine/QueryPlanner.h b/src/engine/QueryPlanner.h index e09be794ad..85adb09fb9 100644 --- a/src/engine/QueryPlanner.h +++ b/src/engine/QueryPlanner.h @@ -36,40 +36,22 @@ class QueryPlanner { TripleGraph(const TripleGraph& other, vector keepNodes); struct Node { - Node(size_t id, SparqlTriple t) : _id(id), _triple(std::move(t)) { - if (isVariable(_triple._s)) { - _variables.insert(_triple._s.getVariable()); + Node(size_t id, 
SparqlTriple t) : id_(id), triple_(std::move(t)) { + if (isVariable(triple_._s)) { + _variables.insert(triple_._s.getVariable()); } - if (isVariable(_triple._p)) { - _variables.insert(Variable{_triple._p._iri}); + if (isVariable(triple_._p)) { + _variables.insert(Variable{triple_._p._iri}); } - if (isVariable(_triple._o)) { - _variables.insert(_triple._o.getVariable()); + if (isVariable(triple_._o)) { + _variables.insert(triple_._o.getVariable()); } } - Node(size_t id, const Variable& cvar, std::vector words, - const vector& trips) - : _id(id), - // TODO What is this triple used for? If it is just a - // dummy, then we can replace it by a `variant`. - _triple(cvar, PropertyPath::fromIri(INTERNAL_TEXT_MATCH_PREDICATE), - TripleComponent::UNDEF{}), - _cvar(cvar), - _wordPart(std::move(words)) { - _variables.insert(cvar); - for (const auto& t : trips) { - if (isVariable(t._s)) { - _variables.insert(t._s.getVariable()); - } - if (isVariable(t._p)) { - _variables.insert(Variable{t._p._iri}); - } - if (isVariable(t._o)) { - _variables.insert(t._o.getVariable()); - } - } + Node(size_t id, Variable cvar, std::string word, SparqlTriple t) + : Node(id, std::move(t)) { + cvar_ = std::move(cvar); + wordPart_ = std::move(word); } Node(const Node& other) = default; @@ -79,30 +61,32 @@ class QueryPlanner { // Returns true if the two nodes equal apart from the id // and the order of variables bool isSimilar(const Node& other) const { - return _triple == other._triple && _cvar == other._cvar && - _wordPart == other._wordPart && _variables == other._variables; + return triple_ == other.triple_ && cvar_ == other.cvar_ && + wordPart_ == other.wordPart_ && _variables == other._variables; } + bool isTextNode() const { return cvar_.has_value(); } + friend std::ostream& operator<<(std::ostream& out, const Node& n) { - out << "id: " << n._id << " triple: " << n._triple.asString() + out << "id: " << n.id_ << " triple: " << n.triple_.asString() << " vars_ "; for (const auto& s : 
n._variables) { out << s.name() << ", "; } // TODO Should the `cvar` and the `wordPart` be stored // together? - if (n._cvar.has_value()) { - out << " cvar " << n._cvar.value().name() << " wordPart " - << absl::StrJoin(n._wordPart.value(), " "); + if (n.cvar_.has_value()) { + out << " cvar " << n.cvar_.value().name() << " wordPart " + << n.wordPart_.value(); } return out; } - size_t _id; - SparqlTriple _triple; + size_t id_; + SparqlTriple triple_; ad_utility::HashSet _variables; - std::optional _cvar = std::nullopt; - std::optional> _wordPart = std::nullopt; + std::optional cvar_ = std::nullopt; + std::optional wordPart_ = std::nullopt; }; // Allows for manually building triple graphs for testing @@ -119,13 +103,9 @@ class QueryPlanner { ad_utility::HashMap _nodeMap; std::list _nodeStorage; - ad_utility::HashMap> identifyTextCliques() const; - vector bfsLeaveOut(size_t startNode, ad_utility::HashSet leaveOut) const; - void collapseTextCliques(); - private: vector>> splitAtContextVars( const vector& origFilters, @@ -219,6 +199,8 @@ class QueryPlanner { [[nodiscard]] TripleGraph createTripleGraph( const parsedQuery::BasicGraphPattern* pattern) const; + void addNodeToTripleGraph(const TripleGraph::Node&, TripleGraph&) const; + void setEnablePatternTrick(bool enablePatternTrick); // Create a set of possible execution trees for the given parsed query. The @@ -242,6 +224,30 @@ class QueryPlanner { [[nodiscard]] std::vector optimize( ParsedQuery::GraphPattern* rootPattern); + // Add all the possible index scans for the triple represented by the node. + // The triple is "ordinary" in the sense that it is neither a text triple with + // ql:contains-word nor a special pattern trick triple. 
+ template + void seedFromOrdinaryTriple(const TripleGraph::Node& node, + const PushPlanFunction& pushPlan, + const AddedIndexScanFunction& addIndexScan); + + // Helper function used by the seedFromOrdinaryTriple function + template + void indexScanSingleVarCase(const TripleGraph::Node& node, + const PushPlanFunction& pushPlan, + const AddedIndexScanFunction& addIndexScan); + + // Helper function used by the seedFromOrdinaryTriple function + template + void indexScanTwoVarsCase(const TripleGraph::Node& node, + const AddedIndexScanFunction& addIndexScan) const; + + // Helper function used by the seedFromOrdinaryTriple function + template + void indexScanThreeVarsCase(const TripleGraph::Node& node, + const AddedIndexScanFunction& addIndexScan) const; + /** * @brief Fills children with all operations that are associated with a single * node in the triple graph (e.g. IndexScans). diff --git a/src/engine/TextIndexScanForEntity.cpp b/src/engine/TextIndexScanForEntity.cpp new file mode 100644 index 0000000000..352ecde5e1 --- /dev/null +++ b/src/engine/TextIndexScanForEntity.cpp @@ -0,0 +1,110 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Nick Göckel + +#include "engine/TextIndexScanForEntity.h" + +// _____________________________________________________________________________ +TextIndexScanForEntity::TextIndexScanForEntity( + QueryExecutionContext* qec, Variable textRecordVar, + std::variant entity, string word) + : Operation(qec), + textRecordVar_(std::move(textRecordVar)), + varOrFixed_(qec, std::move(entity)), + word_(std::move(word)) {} + +// _____________________________________________________________________________ +ResultTable TextIndexScanForEntity::computeResult() { + IdTable idTable = getExecutionContext()->getIndex().getEntityMentionsForWord( + word_, getExecutionContext()->getAllocator()); + + if (hasFixedEntity()) { + auto beginErase = std::ranges::remove_if(idTable, [this](const auto& row) { + return row[1].getVocabIndex() != getVocabIndexOfFixedEntity(); + }); + idTable.erase(beginErase.begin(), idTable.end()); + idTable.setColumnSubset(std::vector{0, 2}); + } + + // Add details to the runtimeInfo. This has no effect on the result.
+ if (hasFixedEntity()) { + runtimeInfo().addDetail("fixed entity: ", fixedEntity()); + } else { + runtimeInfo().addDetail("entity var: ", entityVariable().name()); + } + runtimeInfo().addDetail("word: ", word_); + + return {std::move(idTable), resultSortedOn(), LocalVocab{}}; +} + +// _____________________________________________________________________________ +VariableToColumnMap TextIndexScanForEntity::computeVariableToColumnMap() const { + VariableToColumnMap vcmap; + auto addDefinedVar = [&vcmap, + index = ColumnIndex{0}](const Variable& var) mutable { + vcmap[var] = makeAlwaysDefinedColumn(index); + ++index; + }; + addDefinedVar(textRecordVar_); + if (hasFixedEntity()) { + addDefinedVar(textRecordVar_.getScoreVariable(fixedEntity())); + } else { + addDefinedVar(entityVariable()); + addDefinedVar(textRecordVar_.getScoreVariable(entityVariable())); + } + return vcmap; +} + +// _____________________________________________________________________________ +size_t TextIndexScanForEntity::getResultWidth() const { + return 2 + (hasFixedEntity() ? 
0 : 1); +} + +// _____________________________________________________________________________ +size_t TextIndexScanForEntity::getCostEstimate() { + if (hasFixedEntity()) { + // We currently have to first materialize and then filter the complete list + // for the fixed entity + return 2 * getExecutionContext()->getIndex().getSizeOfTextBlockForEntities( + word_); + } else { + return getExecutionContext()->getIndex().getSizeOfTextBlockForEntities( + word_); + } +} + +// _____________________________________________________________________________ +uint64_t TextIndexScanForEntity::getSizeEstimateBeforeLimit() { + if (hasFixedEntity()) { + return static_cast( + getExecutionContext()->getIndex().getAverageNofEntityContexts()); + } else { + return getExecutionContext()->getIndex().getSizeOfTextBlockForEntities( + word_); + } +} + +// _____________________________________________________________________________ +bool TextIndexScanForEntity::knownEmptyResult() { + return getExecutionContext()->getIndex().getSizeOfTextBlockForEntities( + word_) == 0; +} + +// _____________________________________________________________________________ +vector TextIndexScanForEntity::resultSortedOn() const { + return {ColumnIndex(0)}; +} + +// _____________________________________________________________________________ +string TextIndexScanForEntity::getDescriptor() const { + return absl::StrCat("TextIndexScanForEntity on ", textRecordVar_.name()); +} + +// _____________________________________________________________________________ +string TextIndexScanForEntity::getCacheKeyImpl() const { + std::ostringstream os; + os << "ENTITY INDEX SCAN FOR WORD: " + << " with word: \"" << word_ << "\" and fixed-entity: \"" + << (hasFixedEntity() ? 
fixedEntity() : "no fixed-entity") << " \""; + return std::move(os).str(); +} diff --git a/src/engine/TextIndexScanForEntity.h b/src/engine/TextIndexScanForEntity.h new file mode 100644 index 0000000000..155a962f12 --- /dev/null +++ b/src/engine/TextIndexScanForEntity.h @@ -0,0 +1,111 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Nick Göckel + +#pragma once + +#include + +#include "./Operation.h" + +// This operation retrieves all text records and their corresponding +// entities from the fulltext index that contain a certain word or prefix. +// The entities are saved to the entityVar_. If the operation is called on a +// fixed entity instead, it only returns entries that contain this entity. +class TextIndexScanForEntity : public Operation { + using FixedEntity = std::pair; + + struct VarOrFixedEntity { + std::variant entity_; + + static std::variant makeEntityVariant( + const QueryExecutionContext* qec, + std::variant entity) { + if (std::holds_alternative(entity)) { + VocabIndex index; + std::string fixedEntity = std::move(std::get(entity)); + bool success = qec->getIndex().getVocab().getId(fixedEntity, &index); + if (!success) { + throw std::runtime_error( + "The entity " + fixedEntity + + " is not part of the underlying knowledge graph and can " + "therefore not be used as the object of ql:contains-entity"); + } + return FixedEntity(std::move(fixedEntity), std::move(index)); + } else { + return std::get(entity); + } + }; + + VarOrFixedEntity(const QueryExecutionContext* qec, + std::variant entity) + : entity_(makeEntityVariant(qec, std::move(entity))) {} + + ~VarOrFixedEntity() = default; + + bool hasFixedEntity() const { + return std::holds_alternative(entity_); + } + }; + + const Variable textRecordVar_; + const VarOrFixedEntity varOrFixed_; + const string word_; + + public: + TextIndexScanForEntity(QueryExecutionContext* qec, Variable textRecordVar, + std::variant entity, + string word); + 
~TextIndexScanForEntity() override = default; + + bool hasFixedEntity() const { return varOrFixed_.hasFixedEntity(); } + + const std::string& fixedEntity() const { + AD_CONTRACT_CHECK(hasFixedEntity()); + return std::get(varOrFixed_.entity_).first; + } + + const Variable& entityVariable() const { + AD_CONTRACT_CHECK(!hasFixedEntity()); + return std::get(varOrFixed_.entity_); + } + + const Variable& textRecordVar() const { return textRecordVar_; } + + const std::string& word() const { return word_; } + + string getCacheKeyImpl() const override; + + string getDescriptor() const override; + + size_t getResultWidth() const override; + + void setTextLimit(size_t) override { + // TODO: implement textLimit + } + + size_t getCostEstimate() override; + + uint64_t getSizeEstimateBeforeLimit() override; + + float getMultiplicity(size_t col) override { + (void)col; + return 1; + } + + bool knownEmptyResult() override; + + vector resultSortedOn() const override; + + VariableToColumnMap computeVariableToColumnMap() const override; + + private: + const VocabIndex& getVocabIndexOfFixedEntity() const { + AD_CONTRACT_CHECK(hasFixedEntity()); + return std::get(varOrFixed_.entity_).second; + } + + ResultTable computeResult() override; + + vector getChildren() override { return {}; } +}; diff --git a/src/engine/TextIndexScanForWord.cpp b/src/engine/TextIndexScanForWord.cpp new file mode 100644 index 0000000000..c490a88c6f --- /dev/null +++ b/src/engine/TextIndexScanForWord.cpp @@ -0,0 +1,82 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Nick Göckel + +#include "engine/TextIndexScanForWord.h" + +// _____________________________________________________________________________ +TextIndexScanForWord::TextIndexScanForWord(QueryExecutionContext* qec, + Variable textRecordVar, string word) + : Operation(qec), + textRecordVar_(std::move(textRecordVar)), + word_(std::move(word)), + isPrefix_(word_.ends_with('*')) {} + +// _____________________________________________________________________________ +ResultTable TextIndexScanForWord::computeResult() { + IdTable idTable = getExecutionContext()->getIndex().getWordPostingsForTerm( + word_, getExecutionContext()->getAllocator()); + + if (!isPrefix_) { + IdTable smallIdTable{getExecutionContext()->getAllocator()}; + smallIdTable.setNumColumns(1); + smallIdTable.resize(idTable.numRows()); + std::ranges::copy(idTable.getColumn(0), smallIdTable.getColumn(0).begin()); + + return {std::move(smallIdTable), resultSortedOn(), LocalVocab{}}; + } + + // Add details to the runtimeInfo. This has no effect on the result. + runtimeInfo().addDetail("word: ", word_); + + return {std::move(idTable), resultSortedOn(), LocalVocab{}}; +} + +// _____________________________________________________________________________ +VariableToColumnMap TextIndexScanForWord::computeVariableToColumnMap() const { + VariableToColumnMap vcmap; + auto addDefinedVar = [&vcmap, + index = ColumnIndex{0}](const Variable& var) mutable { + vcmap[var] = makeAlwaysDefinedColumn(index); + ++index; + }; + addDefinedVar(textRecordVar_); + if (isPrefix_) { + addDefinedVar(textRecordVar_.getMatchingWordVariable( + std::string_view(word_).substr(0, word_.size() - 1))); + } + return vcmap; +} + +// _____________________________________________________________________________ +size_t TextIndexScanForWord::getResultWidth() const { + return 1 + (isPrefix_ ?
1 : 0); +} + +// _____________________________________________________________________________ +size_t TextIndexScanForWord::getCostEstimate() { + return getExecutionContext()->getIndex().getSizeOfTextBlockForWord(word_); +} + +// _____________________________________________________________________________ +uint64_t TextIndexScanForWord::getSizeEstimateBeforeLimit() { + return getExecutionContext()->getIndex().getSizeOfTextBlockForWord(word_); +} + +// _____________________________________________________________________________ +vector TextIndexScanForWord::resultSortedOn() const { + return {ColumnIndex(0)}; +} + +// _____________________________________________________________________________ +string TextIndexScanForWord::getDescriptor() const { + return absl::StrCat("TextIndexScanForWord on ", textRecordVar_.name()); +} + +// _____________________________________________________________________________ +string TextIndexScanForWord::getCacheKeyImpl() const { + std::ostringstream os; + os << "WORD INDEX SCAN: " + << " with word: \"" << word_ << "\""; + return std::move(os).str(); +} diff --git a/src/engine/TextIndexScanForWord.h b/src/engine/TextIndexScanForWord.h new file mode 100644 index 0000000000..53b3f56757 --- /dev/null +++ b/src/engine/TextIndexScanForWord.h @@ -0,0 +1,60 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Nick Göckel + +#pragma once + +#include + +#include "./Operation.h" + +// This operation retrieves all text records from the fulltext index that +// contain a certain word or prefix. 
+class TextIndexScanForWord : public Operation { + private: + const Variable textRecordVar_; + const string word_; + bool isPrefix_ = false; + + public: + TextIndexScanForWord(QueryExecutionContext* qec, Variable textRecordVar, + string word); + + ~TextIndexScanForWord() override = default; + + const Variable& textRecordVar() const { return textRecordVar_; } + + const std::string& word() const { return word_; } + + string getCacheKeyImpl() const override; + + string getDescriptor() const override; + + size_t getResultWidth() const override; + + void setTextLimit(size_t) override { + // TODO: implement textLimit + } + + size_t getCostEstimate() override; + + uint64_t getSizeEstimateBeforeLimit() override; + + float getMultiplicity(size_t col) override { + (void)col; + return 1; + } + + bool knownEmptyResult() override { return getSizeEstimateBeforeLimit() == 0; } + + vector resultSortedOn() const override; + + VariableToColumnMap computeVariableToColumnMap() const override; + + private: + // Returns a ResultTable containing an IdTable with the columns being + // the text variable and the completed word (if it was prefixed) + ResultTable computeResult() override; + + vector getChildren() override { return {}; } +}; diff --git a/src/global/Constants.h b/src/global/Constants.h index e74407d327..7584c3a028 100644 --- a/src/global/Constants.h +++ b/src/global/Constants.h @@ -71,6 +71,8 @@ static const std::string INTERNAL_VARIABLE_PREFIX = "?_QLever_internal_variable_"; static constexpr std::string_view TEXTSCORE_VARIABLE_PREFIX = "?ql_textscore_"; +static constexpr std::string_view ENTITY_VARIABLE_PREFIX = "?ql_entity_"; +static constexpr std::string_view SCORE_VARIABLE_PREFIX = "?ql_score_"; static constexpr std::string_view MATCHINGWORD_VARIABLE_PREFIX = "?ql_matchingword_"; diff --git a/src/index/FTSAlgorithms.cpp b/src/index/FTSAlgorithms.cpp index 1698622600..30552b1b63 100644 --- a/src/index/FTSAlgorithms.cpp +++ b/src/index/FTSAlgorithms.cpp @@ -16,7 +16,7 @@ 
using std::pair; // _____________________________________________________________________________ -Index::WordEntityPostings FTSAlgorithms::filterByRange( +Index::WordEntityPostings FTSAlgorithms::filterByRangeWep( const IdRange& idRange, const WordEntityPostings& wepPreFilter) { AD_CONTRACT_CHECK(wepPreFilter.wids_.size() == 1); @@ -63,6 +63,46 @@ Index::WordEntityPostings FTSAlgorithms::filterByRange( return wepResult; } +// _____________________________________________________________________________ +IdTable FTSAlgorithms::filterByRange(const IdRange& idRange, + const IdTable& idTablePreFilter) { + AD_CONTRACT_CHECK(idTablePreFilter.numColumns() == 2); + LOG(DEBUG) << "Filtering " << idTablePreFilter.getColumn(0).size() + << " elements by ID range...\n"; + + IdTable idTableResult{idTablePreFilter.getAllocator()}; + idTableResult.setNumColumns(2); + idTableResult.resize(idTablePreFilter.getColumn(0).size()); + + decltype(auto) resultCidColumn = idTableResult.getColumn(0); + decltype(auto) resultWidColumn = idTableResult.getColumn(1); + size_t nofResultElements = 0; + decltype(auto) preFilterCidColumn = idTablePreFilter.getColumn(0); + decltype(auto) preFilterWidColumn = idTablePreFilter.getColumn(1); + // TODO Use views::zip. + for (size_t i = 0; i < preFilterWidColumn.size(); ++i) { + // TODO proper Ids for the text stuff. + // The mapping from words that appear in text records to `WordIndex`es is + // stored in a `Vocabulary` that stores `VocabIndex`es, so we have to + // convert between those two types. + // TODO Can we make the returned `IndexType` a template parameter + // of the vocabulary, s.t. we have a vocabulary that stores `WordIndex`es + // directly? 
+ if (preFilterWidColumn[i].getWordVocabIndex() >= idRange.first() && + preFilterWidColumn[i].getWordVocabIndex() <= idRange.last()) { + resultCidColumn[nofResultElements] = preFilterCidColumn[i]; + resultWidColumn[nofResultElements] = preFilterWidColumn[i]; + nofResultElements++; + } + } + + idTableResult.resize(nofResultElements); + + LOG(DEBUG) << "Filtering by ID range done. Result has " + << idTableResult.numRows() << " elements.\n"; + return idTableResult; +} + // _____________________________________________________________________________ Index::WordEntityPostings FTSAlgorithms::crossIntersect( const WordEntityPostings& matchingContextsWep, diff --git a/src/index/FTSAlgorithms.h b/src/index/FTSAlgorithms.h index 73701919ae..fc753eb2cf 100644 --- a/src/index/FTSAlgorithms.h +++ b/src/index/FTSAlgorithms.h @@ -25,10 +25,15 @@ class FTSAlgorithms { public: // Filters all wep entries out where the wid does not lay inside the // idRange. - static WordEntityPostings filterByRange( + static WordEntityPostings filterByRangeWep( const IdRange& idRange, const WordEntityPostings& wepPreFilter); + // Filters all IdTable entries out where the WordIndex does not lay inside the + // idRange. + static IdTable filterByRange(const IdRange& idRange, + const IdTable& idPreFilter); + // Intersects matchingContextsWep and eBlockWep on the cids_ attribute. If // there are multiple matches for the same cid then we calculate every // possible combination of eids and wids. 
diff --git a/src/index/Index.cpp b/src/index/Index.cpp index 29db81f326..4aabb49e03 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -122,6 +122,16 @@ std::string_view Index::wordIdToString(WordIndex wordIndex) const { return pimpl_->wordIdToString(wordIndex); } +// ____________________________________________________________________________ +size_t Index::getSizeOfTextBlockForWord(const std::string& word) const { + return pimpl_->getSizeOfTextBlockForWord(word); +} + +// ____________________________________________________________________________ +size_t Index::getSizeOfTextBlockForEntities(const std::string& word) const { + return pimpl_->getSizeOfTextBlockForEntities(word); +} + // ____________________________________________________________________________ size_t Index::getSizeEstimate(const std::string& words) const { return pimpl_->getSizeEstimate(words); @@ -170,9 +180,10 @@ Index::WordEntityPostings Index::getContextEntityScoreListsForWords( } // ____________________________________________________________________________ -Index::WordEntityPostings Index::getWordPostingsForTerm( - const std::string& term) const { - return pimpl_->getWordPostingsForTerm(term); +IdTable Index::getWordPostingsForTerm( + const std::string& term, + const ad_utility::AllocatorWithLimit& allocator) const { + return pimpl_->getWordPostingsForTerm(term, allocator); } // ____________________________________________________________________________ @@ -181,6 +192,18 @@ Index::WordEntityPostings Index::getEntityPostingsForTerm( return pimpl_->getEntityPostingsForTerm(term); } +// ____________________________________________________________________________ +IdTable Index::getEntityMentionsForWord( + const string& term, + const ad_utility::AllocatorWithLimit& allocator) const { + return pimpl_->getEntityMentionsForWord(term, allocator); +} + +// ____________________________________________________________________________ +size_t Index::getIndexOfBestSuitedElTerm(const vector& 
terms) const { + return pimpl_->getIndexOfBestSuitedElTerm(terms); +} + // ____________________________________________________________________________ std::string Index::getTextExcerpt(TextRecordIndex cid) const { return pimpl_->getTextExcerpt(cid); diff --git a/src/index/Index.h b/src/index/Index.h index 58d13d5eab..1adfcc6c57 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -157,6 +157,11 @@ class Index { // -------------------------------------------------------------------------- [[nodiscard]] std::string_view wordIdToString(WordIndex wordIndex) const; + [[nodiscard]] size_t getSizeOfTextBlockForWord(const std::string& word) const; + + [[nodiscard]] size_t getSizeOfTextBlockForEntities( + const std::string& word) const; + [[nodiscard]] size_t getSizeEstimate(const std::string& words) const; void getContextListForWords(const std::string& words, IdTable* result) const; @@ -184,10 +189,18 @@ class Index { WordEntityPostings getContextEntityScoreListsForWords( const std::string& words) const; - WordEntityPostings getWordPostingsForTerm(const std::string& term) const; + IdTable getWordPostingsForTerm( + const std::string& term, + const ad_utility::AllocatorWithLimit& allocator) const; WordEntityPostings getEntityPostingsForTerm(const std::string& term) const; + IdTable getEntityMentionsForWord( + const string& term, + const ad_utility::AllocatorWithLimit& allocator) const; + + size_t getIndexOfBestSuitedElTerm(const vector& terms) const; + [[nodiscard]] std::string getTextExcerpt(TextRecordIndex cid) const; // Only for debug reasons and external encoding tests. 
diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 870f1405f5..0654bc024d 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -747,12 +747,12 @@ void IndexImpl::getContextListForWords(const string& words, if (!term.ends_with('*')) { skipColumns.push_back(i); } - wepVecs.push_back(getWordPostingsForTerm(term)); + wepVecs.push_back(getWordPostingsForTermWep(term)); i++; } wep = FTSAlgorithms::crossIntersectKWay(wepVecs, nullptr); } else { - wep = getWordPostingsForTerm(terms[0]); + wep = getWordPostingsForTermWep(terms[0]); } AD_CONTRACT_CHECK(wep.wids_.size() >= terms.size()); @@ -787,7 +787,7 @@ void IndexImpl::getContextListForWords(const string& words, } // _____________________________________________________________________________ -Index::WordEntityPostings IndexImpl::readWordCl( +Index::WordEntityPostings IndexImpl::readWordClWep( const TextBlockMetaData& tbmd) const { Index::WordEntityPostings wep; wep.cids_ = readGapComprList( @@ -804,7 +804,30 @@ Index::WordEntityPostings IndexImpl::readWordCl( } // _____________________________________________________________________________ -Index::WordEntityPostings IndexImpl::readWordEntityCl( +IdTable IndexImpl::readWordCl( + const TextBlockMetaData& tbmd, + const ad_utility::AllocatorWithLimit& allocator) const { + IdTable idTable{2, allocator}; + vector cids = readGapComprList( + tbmd._cl._nofElements, tbmd._cl._startContextlist, + static_cast(tbmd._cl._startWordlist - tbmd._cl._startContextlist), + &TextRecordIndex::make); + idTable.resize(cids.size()); + std::ranges::transform(cids, idTable.getColumn(0).begin(), + &Id::makeFromTextRecordIndex); + std::ranges::transform( + readFreqComprList( + tbmd._cl._nofElements, tbmd._cl._startWordlist, + static_cast(tbmd._cl._startScorelist - + tbmd._cl._startWordlist)), + idTable.getColumn(1).begin(), [](WordIndex id) { + return Id::makeFromWordVocabIndex(WordVocabIndex::make(id)); + }); + return idTable; +} + +// 
_____________________________________________________________________________ +Index::WordEntityPostings IndexImpl::readWordEntityClWep( const TextBlockMetaData& tbmd) const { Index::WordEntityPostings wep; wep.cids_ = readGapComprList( @@ -825,7 +848,36 @@ Index::WordEntityPostings IndexImpl::readWordEntityCl( } // _____________________________________________________________________________ -Index::WordEntityPostings IndexImpl::getWordPostingsForTerm( +IdTable IndexImpl::readWordEntityCl( + const TextBlockMetaData& tbmd, + const ad_utility::AllocatorWithLimit& allocator) const { + IdTable idTable{3, allocator}; + vector cids = readGapComprList( + tbmd._entityCl._nofElements, tbmd._entityCl._startContextlist, + static_cast(tbmd._entityCl._startWordlist - + tbmd._entityCl._startContextlist), + &TextRecordIndex::make); + idTable.resize(cids.size()); + std::ranges::transform(cids, idTable.getColumn(0).begin(), + &Id::makeFromTextRecordIndex); + std::ranges::copy( + readFreqComprList(tbmd._entityCl._nofElements, + tbmd._entityCl._startWordlist, + static_cast(tbmd._entityCl._startScorelist - + tbmd._entityCl._startWordlist), + &Id::fromBits), + idTable.getColumn(1).begin()); + std::ranges::transform( + readFreqComprList( + tbmd._entityCl._nofElements, tbmd._entityCl._startScorelist, + static_cast(tbmd._entityCl._lastByte + 1 - + tbmd._entityCl._startScorelist)), + idTable.getColumn(2).begin(), &Id::makeFromInt); + return idTable; +} + +// _____________________________________________________________________________ +Index::WordEntityPostings IndexImpl::getWordPostingsForTermWep( const string& term) const { LOG(DEBUG) << "Getting word postings for term: " << term << '\n'; Index::WordEntityPostings wep; @@ -834,9 +886,9 @@ Index::WordEntityPostings IndexImpl::getWordPostingsForTerm( return wep; } const auto& tbmd = optionalTbmd.value().tbmd_; - wep = readWordCl(tbmd); + wep = readWordClWep(tbmd); if (optionalTbmd.value().hasToBeFiltered_) { - wep = 
FTSAlgorithms::filterByRange(optionalTbmd.value().idRange_, wep); + wep = FTSAlgorithms::filterByRangeWep(optionalTbmd.value().idRange_, wep); } LOG(DEBUG) << "Word postings for term: " << term << ": cids: " << wep.cids_.size() << " scores " @@ -844,6 +896,27 @@ Index::WordEntityPostings IndexImpl::getWordPostingsForTerm( return wep; } +// _____________________________________________________________________________ +IdTable IndexImpl::getWordPostingsForTerm( + const string& term, + const ad_utility::AllocatorWithLimit& allocator) const { + LOG(DEBUG) << "Getting word postings for term: " << term << '\n'; + IdTable idTable{allocator}; + auto optionalTbmd = getTextBlockMetadataForWordOrPrefix(term); + if (!optionalTbmd.has_value()) { + return idTable; + } + const auto& tbmd = optionalTbmd.value().tbmd_; + idTable = readWordCl(tbmd, allocator); + if (optionalTbmd.value().hasToBeFiltered_) { + idTable = + FTSAlgorithms::filterByRange(optionalTbmd.value().idRange_, idTable); + } + LOG(DEBUG) << "Word postings for term: " << term + << ": cids: " << idTable.getColumn(0).size() << '\n'; + return idTable; +} + // _____________________________________________________________________________ Index::WordEntityPostings IndexImpl::getContextEntityScoreListsForWords( const string& words) const { @@ -871,7 +944,7 @@ Index::WordEntityPostings IndexImpl::getContextEntityScoreListsForWords( skipColumns.push_back(i); } if (i != useElFromTerm) { - wepVecs.push_back(getWordPostingsForTerm(terms[i])); + wepVecs.push_back(getWordPostingsForTermWep(terms[i])); } } wepVecs.push_back(getEntityPostingsForTerm(terms[useElFromTerm])); @@ -1030,14 +1103,27 @@ Index::WordEntityPostings IndexImpl::getEntityPostingsForTerm( return resultWep; } const auto& tbmd = optTbmd.value().tbmd_; - Index::WordEntityPostings matchingContextsWep = getWordPostingsForTerm(term); + Index::WordEntityPostings matchingContextsWep = + getWordPostingsForTermWep(term); // Read the full lists - Index::WordEntityPostings 
eBlockWep = readWordEntityCl(tbmd); + Index::WordEntityPostings eBlockWep = readWordEntityClWep(tbmd); resultWep = FTSAlgorithms::crossIntersect(matchingContextsWep, eBlockWep); return resultWep; } +// _____________________________________________________________________________ +IdTable IndexImpl::getEntityMentionsForWord( + const string& term, + const ad_utility::AllocatorWithLimit& allocator) const { + auto optTbmd = getTextBlockMetadataForWordOrPrefix(term); + if (!optTbmd.has_value()) { + return IdTable{allocator}; + } + const auto& tbmd = optTbmd.value().tbmd_; + return readWordEntityCl(tbmd, allocator); +} + // _____________________________________________________________________________ template vector IndexImpl::readGapComprList(size_t nofElements, off_t from, @@ -1380,6 +1466,30 @@ size_t IndexImpl::getIndexOfBestSuitedElTerm( return std::get<0>(toBeSorted[0]); } +// _____________________________________________________________________________ +size_t IndexImpl::getSizeOfTextBlockForEntities(const string& word) const { + if (word.empty()) { + return 0; + } + auto optTbmd = getTextBlockMetadataForWordOrPrefix(word); + if (!optTbmd.has_value()) { + return 0; + } + return optTbmd.value().tbmd_._entityCl._nofElements; +} + +// _____________________________________________________________________________ +size_t IndexImpl::getSizeOfTextBlockForWord(const string& word) const { + if (word.empty()) { + return 0; + } + auto optTbmd = getTextBlockMetadataForWordOrPrefix(word); + if (!optTbmd.has_value()) { + return 0; + } + return optTbmd.value().tbmd_._cl._nofElements; +} + // _____________________________________________________________________________ size_t IndexImpl::getSizeEstimate(const string& words) const { // TODO vector can be of type std::string_view if called functions @@ -1408,10 +1518,12 @@ auto IndexImpl::getTextBlockMetadataForWordOrPrefix(const std::string& word) AD_CORRECTNESS_CHECK(!word.empty()); IdRange idRange; if 
(word.ends_with(PREFIX_CHAR)) { - if (!textVocab_.getIdRangeForFullTextPrefix(word, &idRange)) { + auto idRangeOpt = textVocab_.getIdRangeForFullTextPrefix(word); + if (!idRangeOpt.has_value()) { LOG(INFO) << "Prefix: " << word << " not in vocabulary\n"; return std::nullopt; } + idRange = idRangeOpt.value(); } else { WordVocabIndex idx; if (!textVocab_.getId(word, &idx)) { diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 8800d670a4..7bea6fe050 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -305,6 +305,17 @@ class IndexImpl { // -------------------------------------------------------------------------- std::string_view wordIdToString(WordIndex wordIndex) const; + size_t getSizeOfTextBlockForEntities(const string& words) const; + + // Returns the size of the whole textblock. If the word is very long or not + // prefixed then only a small number of words actually match. So the final + // result is much smaller. + // Note that as a cost estimate the estimation is correct. Because we always + // have to read the complete block and then filter by the actually needed + // words. + // TODO: improve size estimate by adding a correction factor. + size_t getSizeOfTextBlockForWord(const string& words) const; + size_t getSizeEstimate(const string& words) const; void callFixedGetContextListForWords(const string& words, @@ -335,15 +346,46 @@ class IndexImpl { Index::WordEntityPostings getContextEntityScoreListsForWords( const string& words) const; - Index::WordEntityPostings getWordPostingsForTerm(const string& term) const; + // Does the same as getWordPostingsForTerm but returns a + // WordEntityPosting. Sorted by textRecord. + Index::WordEntityPostings getWordPostingsForTermWep( + const string& wordOrPrefix) const; + + // Returns a set of [textRecord, term] pairs where the term is contained in + // the textRecord. The term can be either the wordOrPrefix itself or a word + // that has wordOrPrefix as a prefix. 
Returned IdTable has columns: + // textRecord, word. Sorted by textRecord. + IdTable getWordPostingsForTerm( + const string& wordOrPrefix, + const ad_utility::AllocatorWithLimit& allocator) const; Index::WordEntityPostings getEntityPostingsForTerm(const string& term) const; - Index::WordEntityPostings readWordCl(const TextBlockMetaData& tbmd) const; + // Returns a set of textRecords and their corresponding entities and + // scores. Each textRecord contains its corresponding entity and the term. + // Returned IdTable has columns: textRecord, entity, score. Sorted by + // textRecord. + // NOTE: This returns a superset because it contains the whole block and + // unfitting words are filtered out later by the join with the + // TextIndexScanForWords operation. + IdTable getEntityMentionsForWord( + const string& term, + const ad_utility::AllocatorWithLimit& allocator) const; + + size_t getIndexOfBestSuitedElTerm(const vector& terms) const; + + Index::WordEntityPostings readWordClWep(const TextBlockMetaData& tbmd) const; + + IdTable readWordCl(const TextBlockMetaData& tbmd, + const ad_utility::AllocatorWithLimit& allocator) const; - Index::WordEntityPostings readWordEntityCl( + Index::WordEntityPostings readWordEntityClWep( const TextBlockMetaData& tbmd) const; + IdTable readWordEntityCl( + const TextBlockMetaData& tbmd, + const ad_utility::AllocatorWithLimit& allocator) const; + string getTextExcerpt(TextRecordIndex cid) const { if (cid.get() >= docsDB_._size) { return ""; @@ -550,8 +592,6 @@ class IndexImpl { size_t nofElements, off_t from, size_t nofBytes, MakeFromUint64t makeFromUint = MakeFromUint64t{}) const; - size_t getIndexOfBestSuitedElTerm(const vector& terms) const; - // Get the metadata for the block from the text index that contains the // `word`. Also works for prefixes that are terminated with `PREFIX_CHAR` like // "astro*". 
Returns `nullopt` if no suitable block was found because no diff --git a/src/index/Vocabulary.cpp b/src/index/Vocabulary.cpp index 414619e632..13699dfcaf 100644 --- a/src/index/Vocabulary.cpp +++ b/src/index/Vocabulary.cpp @@ -183,18 +183,21 @@ void Vocabulary::initializeInternalizedLangs(const StringRange& s) { // ___________________________________________________________________________ template -bool Vocabulary::getIdRangeForFullTextPrefix( - const string& word, IdRange* range) const { +std::optional> Vocabulary::getIdRangeForFullTextPrefix( + const string& word) const { AD_CONTRACT_CHECK(word[word.size() - 1] == PREFIX_CHAR); + IdRange range; auto prefixRange = prefix_range(word.substr(0, word.size() - 1)); bool success = prefixRange.second > prefixRange.first; - *range = IdRange{prefixRange.first, prefixRange.second.decremented()}; if (success) { - AD_CONTRACT_CHECK(range->first().get() < internalVocabulary_.size()); - AD_CONTRACT_CHECK(range->last().get() < internalVocabulary_.size()); + range = IdRange{prefixRange.first, prefixRange.second.decremented()}; + AD_CONTRACT_CHECK(range.first().get() < internalVocabulary_.size()); + AD_CONTRACT_CHECK(range.last().get() < internalVocabulary_.size()); + + return range; } - return success; + return std::nullopt; } // _______________________________________________________________ diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index d9f19a515f..6953c9b254 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -148,13 +148,13 @@ class Vocabulary { bool getId(const string& word, IndexType* idx) const; //! Get an Id range that matches a prefix. - //! Return value signals if something was found at all. + //! Return value also signals if something was found at all. //! CAVEAT! TODO: This is only used for the text index, //! and uses a range, where the last index is still within the range which is //! against C++ conventions! // consider using the prefixRange function. 
- bool getIdRangeForFullTextPrefix(const string& word, - IdRange* range) const; + std::optional> getIdRangeForFullTextPrefix( + const string& word) const; ad_utility::HashMap> getRangesForDatatypes() const; diff --git a/src/parser/data/Variable.h b/src/parser/data/Variable.h index ef35eb9f0c..7667cebdf3 100644 --- a/src/parser/data/Variable.h +++ b/src/parser/data/Variable.h @@ -7,6 +7,7 @@ #include #include #include +#include // Forward declaration because of cyclic dependencies // TODO The coupling of the `Variable` with its `evaluate` methods @@ -39,6 +40,15 @@ class Variable { // Convert `?someVariable` into `?ql_textscore_someVariable` Variable getTextScoreVariable() const; + // Converts `?someTextVar` and `?someEntityVar` into + // `?ql_someTextVar_score_var_someEntityVar`. + // Converts `?someTextVar` and `someFixedEntity` into + // `?ql_someTextVar_fixedEntity_someFixedEntity`. + // Note that if the the fixed entity contains non ascii characters they are + // converted to numbers and escaped. + Variable getScoreVariable( + const std::variant& varOrEntity) const; + // Convert `?someVariable` into `?ql_matchingword_someVariable_someTerm` Variable getMatchingWordVariable(std::string_view term) const; diff --git a/src/parser/data/VariableToColumnMapPrinters.cpp b/src/parser/data/VariableToColumnMapPrinters.cpp index fbf7808533..f23c0aeb23 100644 --- a/src/parser/data/VariableToColumnMapPrinters.cpp +++ b/src/parser/data/VariableToColumnMapPrinters.cpp @@ -15,7 +15,7 @@ Variable::Variable(std::string name) : _name{std::move(name)} { // verify variable name starts with ? or $ and continues without any // special characters. This is weaker than the SPARQL grammar, // but it is close enough so that it will likely never cause issues. 
- AD_CONTRACT_CHECK(ctre::match<"[$?]\\w+">(_name)); + AD_CONTRACT_CHECK(ctre::match<"[$?][\\w]+">(_name)); // normalize notation for consistency _name[0] = '?'; } @@ -58,6 +58,30 @@ Variable Variable::getTextScoreVariable() const { return Variable{absl::StrCat(TEXTSCORE_VARIABLE_PREFIX, name().substr(1))}; } +// _____________________________________________________________________________ +Variable Variable::getScoreVariable( + const std::variant& varOrEntity) const { + std::string_view type; + std::string entity; + if (std::holds_alternative(varOrEntity)) { + type = "_var_"; + entity = std::get(varOrEntity).name().substr(1); + } else { + type = "_fixedEntity_"; + // Converts input string to unambiguous result string not containing any + // special characters. "_" is used as an escaping character. + for (char c : std::get(varOrEntity)) { + if (isalpha(static_cast(c))) { + entity += c; + } else { + absl::StrAppend(&entity, "_", std::to_string(c), "_"); + } + } + } + return Variable{ + absl::StrCat(SCORE_VARIABLE_PREFIX, name().substr(1), type, entity)}; +} + // _____________________________________________________________________________ Variable Variable::getMatchingWordVariable(std::string_view term) const { return Variable{ diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index a129b34fbc..0e4b39638b 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -1034,32 +1034,37 @@ vector Visitor::visit( // Similarly if a triple `?var ql:contains-word "words"` is contained in the // query, then the variable `ql_matchingword_var` is implicitly created and // visible in the query body. 
- auto setMatchingWordAndTextscoreVisibleIfPresent = [this, ctx]( - VarOrTerm& subject, - VarOrPath& predicate, - VarOrTerm& object) { - if (auto* var = std::get_if(&subject)) { - if (auto* propertyPath = std::get_if(&predicate)) { - if (propertyPath->asString() == CONTAINS_WORD_PREDICATE) { - addVisibleVariable(var->getTextScoreVariable()); - string name = object.toSparql(); - if (!((name.starts_with('"') && name.ends_with('"')) || - (name.starts_with('\'') && name.ends_with('\'')))) { - reportError( - ctx, - "ql:contains-word has to be followed by a string in quotes"); - } - for (std::string_view s : std::vector( - absl::StrSplit(name.substr(1, name.size() - 2), ' '))) { - if (!s.ends_with('*')) { - continue; - } - addVisibleVariable( - var->getMatchingWordVariable(s.substr(0, s.size() - 1))); - } - } else if (propertyPath->asString() == CONTAINS_ENTITY_PREDICATE) { - addVisibleVariable(var->getTextScoreVariable()); + auto setMatchingWordAndScoreVisibleIfPresent = [this, ctx]( + VarOrTerm& subject, + VarOrPath& predicate, + VarOrTerm& object) { + auto* var = std::get_if(&subject); + auto* propertyPath = std::get_if(&predicate); + + if (!var || !propertyPath) { + return; + } + + if (propertyPath->asString() == CONTAINS_WORD_PREDICATE) { + string name = object.toSparql(); + if (!((name.starts_with('"') && name.ends_with('"')) || + (name.starts_with('\'') && name.ends_with('\'')))) { + reportError( + ctx, "ql:contains-word has to be followed by a string in quotes"); + } + for (std::string_view s : std::vector( + absl::StrSplit(name.substr(1, name.size() - 2), ' '))) { + if (!s.ends_with('*')) { + continue; } + addVisibleVariable(var->getMatchingWordVariable( + ad_utility::utf8ToLower(s.substr(0, s.size() - 1)))); + } + } else if (propertyPath->asString() == CONTAINS_ENTITY_PREDICATE) { + if (const auto* entVar = std::get_if(&object)) { + addVisibleVariable(var->getScoreVariable(*entVar)); + } else if (const auto* fixedEntity = std::get_if(&object)) { + 
addVisibleVariable(var->getScoreVariable(fixedEntity->toSparql())); } } }; @@ -1069,7 +1074,7 @@ vector Visitor::visit( auto subject = visit(ctx->varOrTerm()); auto tuples = visit(ctx->propertyListPathNotEmpty()); for (auto& [predicate, object] : tuples) { - setMatchingWordAndTextscoreVisibleIfPresent(subject, predicate, object); + setMatchingWordAndScoreVisibleIfPresent(subject, predicate, object); triples.emplace_back(subject, std::move(predicate), std::move(object)); } return triples; diff --git a/test/FTSAlgorithmsTest.cpp b/test/FTSAlgorithmsTest.cpp index 0718441894..f80374a424 100644 --- a/test/FTSAlgorithmsTest.cpp +++ b/test/FTSAlgorithmsTest.cpp @@ -29,7 +29,7 @@ TEST(FTSAlgorithmsTest, filterByRangeTest) { Index::WordEntityPostings resultWep; // Empty - resultWep = FTSAlgorithms::filterByRange(idRange, wep); + resultWep = FTSAlgorithms::filterByRangeWep(idRange, wep); ASSERT_EQ(0u, resultWep.cids_.size()); // None @@ -37,7 +37,7 @@ TEST(FTSAlgorithmsTest, filterByRangeTest) { wep.wids_ = {{2}}; wep.scores_ = {1}; - resultWep = FTSAlgorithms::filterByRange(idRange, wep); + resultWep = FTSAlgorithms::filterByRangeWep(idRange, wep); ASSERT_EQ(0u, resultWep.cids_.size()); // Match @@ -45,7 +45,7 @@ TEST(FTSAlgorithmsTest, filterByRangeTest) { wep.wids_ = {{2, 5, 7, 5, 6}}; wep.scores_ = {1, 1, 1, 1, 1}; - resultWep = FTSAlgorithms::filterByRange(idRange, wep); + resultWep = FTSAlgorithms::filterByRangeWep(idRange, wep); EXPECT_THAT(resultWep.cids_, ::testing::ElementsAre(TRID(0), TRID(1), TRID(2), TRID(3))); EXPECT_THAT(resultWep.eids_, ::testing::ElementsAre()); @@ -57,7 +57,7 @@ TEST(FTSAlgorithmsTest, filterByRangeTest) { wep.scores_ = {1, 1, 1, 1, 1, 1}; // Partial - resultWep = FTSAlgorithms::filterByRange(idRange, wep); + resultWep = FTSAlgorithms::filterByRangeWep(idRange, wep); EXPECT_THAT(resultWep.cids_, ::testing::ElementsAre(TRID(0), TRID(1), TRID(2), TRID(3))); EXPECT_THAT(resultWep.eids_, ::testing::ElementsAre()); diff --git 
a/test/IndexTestHelpers.h b/test/IndexTestHelpers.h index ad219cf523..45765556c8 100644 --- a/test/IndexTestHelpers.h +++ b/test/IndexTestHelpers.h @@ -43,7 +43,8 @@ Index makeTestIndex(const std::string& indexBasename, std::optional turtleInput = std::nullopt, bool loadAllPermutations = true, bool usePatterns = true, bool usePrefixCompression = true, - ad_utility::MemorySize blocksizePermutations = 16_B); + ad_utility::MemorySize blocksizePermutations = 16_B, + bool createTextIndex = false); // Return a static `QueryExecutionContext` that refers to an index that was // build using `makeTestIndex` (see above). The index (most notably its @@ -53,7 +54,8 @@ QueryExecutionContext* getQec( std::optional turtleInput = std::nullopt, bool loadAllPermutations = true, bool usePatterns = true, bool usePrefixCompression = true, - ad_utility::MemorySize blocksizePermutations = 16_B); + ad_utility::MemorySize blocksizePermutations = 16_B, + bool createTextIndex = false); // Return a lambda that takes a string and converts it into an ID by looking // it up in the vocabulary of `index`. An `AD_CONTRACT_CHECK` will fail if the diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index aec5d83b9d..fdb549a733 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -14,10 +14,6 @@ namespace h = queryPlannerTestHelpers; using Var = Variable; -namespace { -auto lit = ad_utility::testing::tripleComponentLiteral; -} - TEST(QueryPlannerTest, createTripleGraph) { using TripleGraph = QueryPlanner::TripleGraph; using Node = QueryPlanner::TripleGraph::Node; @@ -201,321 +197,6 @@ TEST(QueryPlannerTest, testBFSLeaveOut) { } } -TEST(QueryPlannerTest, testcollapseTextCliques) { - using TripleGraph = QueryPlanner::TripleGraph; - using Node = QueryPlanner::TripleGraph::Node; - using std::vector; - { - { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?x WHERE {?x

. ?c ql:contains-entity ?x. ?c " - "ql:contains-word \"abc\"}"); - QueryPlanner qp(nullptr); - auto tg = qp.createTripleGraph(&pq.children()[0].getBasic()); - ASSERT_EQ( - "0 {s: ?x, p:

, o: } : (1)\n" - "1 {s: ?c, p: " - ", o: ?x} : " - "(0, 2)\n" - "2 {s: ?c, p: " - ", " - "o: \"abc\"} " - ": " - "(1)", - tg.asString()); - tg.collapseTextCliques(); - TripleGraph expected = - TripleGraph(std::vector>>( - {std::make_pair>( - QueryPlanner::TripleGraph::Node( - 0, Var{"?c"}, {"abc"}, - { - SparqlTriple(Var{"?c"}, - "", - Var{"?x"}), - SparqlTriple(Var{"?c"}, - "", - lit("\"abc\"")), - }), - {1}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 1, SparqlTriple(Var{"?x"}, "

", "")), - {0})})); - ASSERT_TRUE(tg.isSimilar(expected)); - } - { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?x WHERE {?x

. ?c " - " ?x. ?c " - " " - "\"abc\" . ?c " - "ql:contains-entity ?y}"); - QueryPlanner qp(nullptr); - auto tg = qp.createTripleGraph(&pq.children()[0].getBasic()); - ASSERT_EQ( - "0 {s: ?x, p:

, o: } : (1)\n" - "1 {s: ?c, p: " - ", o: ?x} : " - "(0, 2, 3)\n" - "2 {s: ?c, p: " - ", " - "o: \"abc\"} " - ": " - "(1, 3)\n" - "3 {s: ?c, p: " - ", o: ?y} : " - "(1, 2)", - tg.asString()); - tg.collapseTextCliques(); - TripleGraph expected = - TripleGraph(std::vector>>( - {std::make_pair>( - QueryPlanner::TripleGraph::Node( - 0, Var{"?c"}, {"abc"}, - {SparqlTriple(Var{"?c"}, - "", - Var{"?x"}), - SparqlTriple(Var{"?c"}, - "", - lit("\"abc\"")), - SparqlTriple(Var{"?c"}, - "", - Var{"?y"})}), - {1}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 1, SparqlTriple(Var{"?x"}, "

", "")), - {0})})); - ASSERT_TRUE(tg.isSimilar(expected)); - } - { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?x WHERE {?x

. ?c ql:contains-entity ?x. ?c " - "ql:contains-word \"abc\" . ?c ql:contains-entity ?y. ?y " - "}"); - QueryPlanner qp(nullptr); - auto tg = qp.createTripleGraph(&pq.children()[0].getBasic()); - ASSERT_EQ( - "0 {s: ?x, p:

, o: } : (1)\n" - "1 {s: ?c, p: " - ", o: ?x} : " - "(0, 2, 3)\n" - "2 {s: ?c, p: " - ", " - "o: \"abc\"} " - ": " - "(1, 3)\n" - "3 {s: ?c, p: " - ", o: ?y} : " - "(1, 2, 4)\n" - "4 {s: ?y, p: , o: } : (3)", - tg.asString()); - tg.collapseTextCliques(); - TripleGraph expected = - TripleGraph(std::vector>>( - {std::make_pair>( - QueryPlanner::TripleGraph::Node( - 0, Var{"?c"}, {"abc"}, - {SparqlTriple(Var{"?c"}, - "", - Var{"?x"}), - SparqlTriple(Var{"?c"}, - "", - lit("\"abc\"")), - SparqlTriple(Var{"?c"}, - "", - Var{"?y"})}), - {1, 2}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 1, SparqlTriple(Var{"?x"}, "

", "")), - {0}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 2, SparqlTriple(Var{"?y"}, "", "")), - {0})})); - ASSERT_TRUE(tg.isSimilar(expected)); - } - { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?x WHERE {?x

. ?c ql:contains-entity ?x. ?c " - "ql:contains-word \"abc\" . ?c ql:contains-entity ?y. ?c2 " - "ql:contains-entity ?y. ?c2 ql:contains-word \"xx\"}"); - QueryPlanner qp(nullptr); - auto tg = qp.createTripleGraph(&pq.children()[0].getBasic()); - TripleGraph expected = - TripleGraph(std::vector>>( - {std::make_pair>( - QueryPlanner::TripleGraph::Node( - 0, SparqlTriple(Var{"?x"}, "

", "")), - {1}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 1, SparqlTriple(Var{"?c"}, - "", - Var{"?x"})), - {0, 2, 3}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 2, SparqlTriple(Var{"?c"}, - "", - lit("\"abc\""))), - {1, 3}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 3, SparqlTriple(Var{"?c"}, - "", - Var{"?y"})), - {1, 2, 4}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 4, SparqlTriple(Var{"?c2"}, - "", - Var{"?y"})), - {3, 5}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 5, SparqlTriple(Var{"?c2"}, - "", - lit("\"xx\""))), - {4})})); - - ASSERT_TRUE(tg.isSimilar(expected)); - tg.collapseTextCliques(); - TripleGraph expected2 = - TripleGraph(std::vector>>( - {std::make_pair>( - QueryPlanner::TripleGraph::Node( - 0, Var{"?c"}, {"abc"}, - {SparqlTriple(Var{"?c"}, - "", - Var{"?x"}), - SparqlTriple(Var{"?c"}, - "", - lit("\"abc\"")), - SparqlTriple(Var{"?c"}, - "", - Var{"?y"})}), - {1, 2}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 1, Var{"?c2"}, {"xx"}, - {SparqlTriple(Var{"?c2"}, - "", - Var{"?y"}), - SparqlTriple(Var{"?c2"}, - "", - lit("\"xx\""))}), - {0}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 2, SparqlTriple(Var{"?x"}, "

", "")), - {0})})); - ASSERT_TRUE(tg.isSimilar(expected2)); - } - { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?x WHERE {?x

. ?c ql:contains-entity ?x. ?c " - "ql:contains-word \"abc\" . ?c ql:contains-entity ?y. ?c2 " - "ql:contains-entity ?y. ?c2 ql:contains-word \"xx\". ?y " - "}"); - QueryPlanner qp(nullptr); - auto tg = qp.createTripleGraph(&pq.children()[0].getBasic()); - ASSERT_EQ( - "0 {s: ?x, p:

, o: } : (1)\n" - "1 {s: ?c, p: " - ", o: ?x} : " - "(0, 2, 3)\n" - "2 {s: ?c, p: " - ", " - "o: \"abc\"} " - ": " - "(1, 3)\n" - "3 {s: ?c, p: " - ", o: ?y} : " - "(1, 2, 4, 6)\n" - "4 {s: ?c2, p: " - ", o: ?y} " - ": (3, 5, 6)\n" - "5 {s: ?c2, p: " - ", " - "o: \"xx\"} " - ": " - "(4)\n" - "6 {s: ?y, p: , o: } : (3, 4)", - tg.asString()); - tg.collapseTextCliques(); - TripleGraph expected2 = - TripleGraph(std::vector>>( - {std::make_pair>( - QueryPlanner::TripleGraph::Node( - 0, Var{"?c"}, {"abc"}, - {SparqlTriple(Var{"?c"}, - "", - Var{"?x"}), - SparqlTriple(Var{"?c"}, - "", - "abc"), - SparqlTriple(Var{"?c"}, - "", - Var{"?y"})}), - {1, 2, 3}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 1, Var{"?c2"}, {"xx"}, - {SparqlTriple(Var{"?c2"}, - "", - Var{"?y"}), - SparqlTriple(Var{"?c2"}, - "", - lit("\"xx\""))}), - {0, 3}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 2, SparqlTriple(Var{"?x"}, "

", "")), - {0}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 3, SparqlTriple(Var{"?y"}, "", "")), - {0, 1})})); - ASSERT_TRUE(tg.isSimilar(expected2)); - } - } -} - TEST(QueryPlannerTest, indexScanOneVariable) { auto scan = h::IndexScanFromStrings; using enum Permutation::Enum; @@ -698,96 +379,40 @@ TEST(QueryExecutionTreeTest, testBooksGermanAwardNomAuth) { scan("?x", "", "?y"), scan("?y", "", ""))); } -/* TEST(QueryExecutionTreeTest, testPlantsEdibleLeaves) { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?a \n " - "WHERE {?a . ?c ql:contains-entity ?a. " - "?c ql:contains-word \"edible leaves\"} TEXTLIMIT 5"); - QueryPlanner qp(nullptr); - QueryPlanner::TripleGraph tg = - qp.createTripleGraph(&pq.children()[0].getBasic()); - ASSERT_EQ(1u, tg._nodeMap.find(0)->second->_variables.size()); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ( - "{\n TEXT OPERATION WITH FILTER: co-occurrence with words: " - "\"edible leaves\" and 1 variables with textLimit = 5 " - "filtered by\n {\n SCAN POS with P = \"\", " - "O = \"\"\n qet-width: 1 \n }\n filtered on " - "column 0\n qet-width: 3 \n}", - qet.getCacheKey()); -} - -TEST(QueryExecutionTreeTest, testTextQuerySE) { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?c \n " - "WHERE {?c ql:contains-word \"search engine\"}"); - QueryPlanner qp(nullptr); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ(absl::StrCat( - "{\n TEXT OPERATION WITHOUT FILTER: co-occurrence with words:", - " \"search engine\" and 0 variables with textLimit = ", - TEXT_LIMIT_DEFAULT, "\n", " qet-width: 2 \n}"), - qet.getCacheKey()); -} - -TEST(QueryExecutionTreeTest, testBornInEuropeOwCocaine) { - ParsedQuery pq = SparqlParser::parseQuery( - "PREFIX : <>\n" - "SELECT ?x ?y ?c\n " - "WHERE \t {" - "?x :Place_of_birth ?y ." - "?y :Contained_by :Europe ." - "?c ql:contains-entity ?x ." - "?c ql:contains-word \"cocaine\" ." 
- "} TEXTLIMIT 1"); - QueryPlanner qp(nullptr); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ( - "{\n TEXT OPERATION WITH FILTER: co-occurrence with words: " - "\"cocaine\" and 1 variables with textLimit = 1 filtered by\n " - "{\n JOIN\n {\n SCAN POS with P = \"\", " - "O = \"\"\n qet-width: 1 \n } join-column: [0]\n" - " |X|\n {\n SCAN POS with P = \"\"\n" - " qet-width: 2 \n } join-column: [0]\n qet-width: 2 \n" - " }\n filtered on column 1\n qet-width: 4 \n}", - qet.getCacheKey()); - auto c = Variable{"?c"}; - ASSERT_EQ(0u, qet.getVariableColumn(c)); - ASSERT_EQ(1u, qet.getVariableColumn(c.getTextScoreVariable())); - ASSERT_EQ(2u, qet.getVariableColumn(Variable{"?y"})); + auto scan = h::IndexScanFromStrings; + auto wordScan = h::TextIndexScanForWord; + auto entityScan = h::TextIndexScanForEntity; + h::expect( + "SELECT ?a WHERE {?a . ?c ql:contains-entity ?a. ?c " + "ql:contains-word \"edible leaves\"}", + h::UnorderedJoins(scan("?a", "", ""), + wordScan(Var{"?c"}, "edible"), + wordScan(Var{"?c"}, "leaves"), + entityScan(Var{"?c"}, Var{"?a"}, "edible"))); } TEST(QueryExecutionTreeTest, testCoOccFreeVar) { - ParsedQuery pq = SparqlParser::parseQuery( - "PREFIX : <>" - "SELECT ?x ?y WHERE {" - "?x :is-a :Politician ." - "?c ql:contains-entity ?x ." - "?c ql:contains-word \"friend*\" ." - "?c ql:contains-entity ?y ." 
- "} TEXTLIMIT 1"); - QueryPlanner qp(nullptr); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ( - "{\n TEXT OPERATION WITH FILTER: co-occurrence with words: " - "\"friend*\" and 2 variables with textLimit = 1 filtered by\n" - " {\n SCAN POS with P = \"\", O = \"" - "\"\n qet-width: 1 \n }\n filtered on column 0\n " - " qet-width: 5 \n}", - qet.getCacheKey()); - auto c = Variable{"?c"}; - ASSERT_EQ(0u, qet.getVariableColumn(c)); - ASSERT_EQ(1u, qet.getVariableColumn(c.getTextScoreVariable())); - ASSERT_EQ(2u, qet.getVariableColumn(Variable{"?y"})); - ASSERT_EQ(3u, qet.getVariableColumn(Variable{"?x"})); - ASSERT_EQ(4u, qet.getVariableColumn(c.getMatchingWordVariable("friend"))); + auto scan = h::IndexScanFromStrings; + auto wordScan = h::TextIndexScanForWord; + auto entityScan = h::TextIndexScanForEntity; + h::expect( + "PREFIX : <> SELECT ?x ?y WHERE { ?x :is-a :Politician . ?c " + "ql:contains-entity ?x . ?c ql:contains-word \"friend*\" . ?c " + "ql:contains-entity ?y }", + h::UnorderedJoins(scan("?x", "", ""), + entityScan(Var{"?c"}, Var{"?x"}, "friend*"), + wordScan(Var{"?c"}, "friend*"), + entityScan(Var{"?c"}, Var{"?y"}, "friend*"))); } TEST(QueryExecutionTreeTest, testPoliticiansFriendWithScieManHatProj) { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?p ?s \n " + auto scan = h::IndexScanFromStrings; + auto wordScan = h::TextIndexScanForWord; + auto entityScan = h::TextIndexScanForEntity; + h::expect( + "SELECT ?p ?s" "WHERE {" "?a . " "?c ql:contains-entity ?a ." @@ -795,24 +420,16 @@ TEST(QueryExecutionTreeTest, testPoliticiansFriendWithScieManHatProj) { "?c ql:contains-entity ?s ." "?s ." "?c2 ql:contains-entity ?s ." 
- "?c2 ql:contains-word \"manhattan project\"} TEXTLIMIT 1"); - QueryPlanner qp(nullptr); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ( - "{\n TEXT OPERATION WITH FILTER: co-occurrence with words: \"manhattan " - "project\" and 1 variables with textLimit = 1 filtered by\n {\n " - "JOIN\n {\n SORT(internal) on columns:asc(2) \n {\n " - "TEXT OPERATION WITH FILTER: co-occurrence with words: \"friend*\" and 2 " - "variables with textLimit = 1 filtered by\n {\n SCAN POS " - "with P = \"\", O = \"\"\n qet-width: 1 \n " - " }\n filtered on column 0\n qet-width: 5 \n }\n " - " qet-width: 5 \n } join-column: [2]\n |X|\n {\n SCAN " - "POS with P = \"\", O = \"\"\n qet-width: 1 \n " - "} join-column: [0]\n qet-width: 5 \n }\n filtered on column 2\n " - "qet-width: 7 \n}", - qet.getCacheKey()); + "?c2 ql:contains-word \"manhattan project\"}", + h::UnorderedJoins(scan("?a", "", ""), + entityScan(Var{"?c"}, Var{"?a"}, "friend*"), + wordScan(Var{"?c"}, "friend*"), + entityScan(Var{"?c"}, Var{"?s"}, "friend*"), + scan("?s", "", ""), + entityScan(Var{"?c2"}, Var{"?s"}, "manhattan"), + wordScan(Var{"?c2"}, "manhattan"), + wordScan(Var{"?c2"}, "project"))); } - */ TEST(QueryExecutionTreeTest, testCyclicQuery) { ParsedQuery pq = SparqlParser::parseQuery( @@ -1158,3 +775,103 @@ TEST(QueryPlanner, BindAtBeginningOfQuery) { " BIND (3 + 5 AS ?x) }", h::Bind(h::NeutralElementOperation(), "3 + 5", Variable{"?x"})); } + +// __________________________________________________________________________ +TEST(QueryPlannerTest, TextIndexScanForWord) { + auto qec = ad_utility::testing::getQec( + "

\"this text contains some words and is part of the test\" . " + "

\"testEntity\" .

\"picking the right text can be a hard " + "test\" .

\"sentence for multiple words tests\" . " + "

\"testing and picking\"", + true, true, true, 16_B, true); + auto wordScan = h::TextIndexScanForWord; + + h::expect("SELECT * WHERE { ?text ql:contains-word \"test*\" }", + wordScan(Var{"?text"}, "test*"), qec); + + h::expect("SELECT * WHERE { ?text2 ql:contains-word \"test\" }", + wordScan(Var{"?text2"}, "test"), qec); + + h::expect( + "SELECT * WHERE { ?text2 ql:contains-word \"multiple words* test\" }", + h::UnorderedJoins(wordScan(Var{"?text2"}, "test"), + wordScan(Var{"?text2"}, "words*"), + wordScan(Var{"?text2"}, "multiple")), + qec); + + AD_EXPECT_THROW_WITH_MESSAGE( + SparqlParser::parseQuery( + "SELECT * WHERE { ?text ql:contains-word . }"), + ::testing::ContainsRegex( + "ql:contains-word has to be followed by a string in quotes")); +} + +// __________________________________________________________________________ +TEST(QueryPlannerTest, TextIndexScanForEntity) { + auto qec = ad_utility::testing::getQec( + "

\"this text contains some words and is part of the test\" . " + "

.

\"picking the right text can be a hard " + "test\" .

\"only this text contains the word opti \" . " + "

\"testing and picking\"", + true, true, true, 16_B, true); + + auto wordScan = h::TextIndexScanForWord; + auto entityScan = h::TextIndexScanForEntity; + h::expect( + "SELECT * WHERE { ?text ql:contains-entity ?scientist . ?text " + "ql:contains-word \"test*\" }", + h::Join(wordScan(Var{"?text"}, "test*"), + entityScan(Var{"?text"}, Var{"?scientist"}, "test*")), + qec); + + h::expect( + "SELECT * WHERE { ?text ql:contains-entity . ?text " + "ql:contains-word \"test\" }", + h::Join(wordScan(Var{"?text"}, "test"), + entityScan(Var{"?text"}, "", "test")), + qec); + + // Test case sensitivity + h::expect( + "SELECT * WHERE { ?text ql:contains-entity . ?text " + "ql:contains-word \"TeST\" }", + h::Join(wordScan(Var{"?text"}, "test"), + entityScan(Var{"?text"}, "", "test")), + qec); + + // NOTE: It is important that the TextIndexScanForEntity uses "opti", because + // we also want to test here if the QueryPlanner assigns the optimal word to + // the Operation. + h::expect( + "SELECT * WHERE { ?text ql:contains-word \"picking*\" . ?text " + "ql:contains-entity . ?text ql:contains-word " + "\"opti\" . ?text ql:contains-word \"testi*\"}", + h::UnorderedJoins(entityScan(Var{"?text"}, "", "opti"), + wordScan(Var{"?text"}, "testi*"), + wordScan(Var{"?text"}, "opti"), + wordScan(Var{"?text"}, "picking*")), + qec); + + ParsedQuery pq = SparqlParser::parseQuery( + "SELECT * WHERE { ?text ql:contains-entity ?scientist . }"); + QueryPlanner qp(nullptr); + AD_EXPECT_THROW_WITH_MESSAGE( + qp.createExecutionTree(pq), + ::testing::ContainsRegex( + "Missing ql:contains-word statement. A ql:contains-entity statement " + "always also needs corresponding ql:contains-word statement.")); +} + +// __________________________________________________________________________ +TEST(QueryPlannerTest, TooManyTriples) { + std::string query = "SELECT * WHERE {"; + for (size_t i = 0; i < 65; i++) { + query = absl::StrCat(query, " ?x

?y ."); + } + query = absl::StrCat(query, "}"); + ParsedQuery pq = SparqlParser::parseQuery(query); + QueryPlanner qp(nullptr); + AD_EXPECT_THROW_WITH_MESSAGE( + qp.createExecutionTree(pq), + ::testing::ContainsRegex("At most 64 triples allowed at the moment.")); +} diff --git a/test/QueryPlannerTestHelpers.h b/test/QueryPlannerTestHelpers.h index 51cc962d7b..0b785e69e7 100644 --- a/test/QueryPlannerTestHelpers.h +++ b/test/QueryPlannerTestHelpers.h @@ -15,6 +15,8 @@ #include "engine/QueryExecutionTree.h" #include "engine/QueryPlanner.h" #include "engine/Sort.h" +#include "engine/TextIndexScanForEntity.h" +#include "engine/TextIndexScanForWord.h" #include "engine/TransitivePath.h" #include "gmock/gmock-matchers.h" #include "gmock/gmock.h" @@ -86,6 +88,41 @@ inline auto IndexScan = AD_PROPERTY(IndexScan, getObject, Eq(object)))); }; +inline auto TextIndexScanForWord = [](Variable textRecordVar, + string word) -> QetMatcher { + return RootOperation<::TextIndexScanForWord>(AllOf( + AD_PROPERTY(::TextIndexScanForWord, getResultWidth, + Eq(1 + word.ends_with('*'))), + AD_PROPERTY(::TextIndexScanForWord, textRecordVar, Eq(textRecordVar)), + AD_PROPERTY(::TextIndexScanForWord, word, word))); +}; + +inline auto TextIndexScanForEntity = + [](Variable textRecordVar, std::variant entity, + string word) -> QetMatcher { + // TODO: Implement AD_THROWING_PROPERTY(..., Exception matcher) and use it + // here to test the contract-checks in entityVariable() and fixedEntity(). 
+ if (std::holds_alternative(entity)) { + return RootOperation<::TextIndexScanForEntity>(AllOf( + AD_PROPERTY(::TextIndexScanForEntity, getResultWidth, + Eq(2 + std::holds_alternative(entity))), + AD_PROPERTY(::TextIndexScanForEntity, textRecordVar, Eq(textRecordVar)), + AD_PROPERTY(::TextIndexScanForEntity, entityVariable, + std::get(entity)), + AD_PROPERTY(::TextIndexScanForEntity, word, word), + AD_PROPERTY(::TextIndexScanForEntity, hasFixedEntity, false))); + } else { + return RootOperation<::TextIndexScanForEntity>(AllOf( + AD_PROPERTY(::TextIndexScanForEntity, getResultWidth, + Eq(2 + std::holds_alternative(entity))), + AD_PROPERTY(::TextIndexScanForEntity, textRecordVar, Eq(textRecordVar)), + AD_PROPERTY(::TextIndexScanForEntity, fixedEntity, + std::get(entity)), + AD_PROPERTY(::TextIndexScanForEntity, word, word), + AD_PROPERTY(::TextIndexScanForEntity, hasFixedEntity, true))); + } +}; + inline auto Bind = [](const QetMatcher& childMatcher, std::string_view expression, Variable target) -> QetMatcher { diff --git a/test/SparqlParserTest.cpp b/test/SparqlParserTest.cpp index 22f130e455..f15e444e21 100644 --- a/test/SparqlParserTest.cpp +++ b/test/SparqlParserTest.cpp @@ -841,22 +841,22 @@ TEST(ParserTest, testSolutionModifiers) { { auto pq = SparqlParser::parseQuery( - "SELECT DISTINCT ?x ?ql_textscore_x ?y WHERE \t {?x " + "SELECT DISTINCT ?x ?ql_score_x_var_y ?y WHERE \t {?x " "ql:contains-entity ?y}\n" - "ORDER BY ASC(?y) DESC(?ql_textscore_x) LIMIT 10 OFFSET 15"); + "ORDER BY ASC(?y) DESC(?ql_score_x_var_y) LIMIT 10 OFFSET 15"); ASSERT_TRUE(pq.hasSelectClause()); const auto& selectClause = pq.selectClause(); ASSERT_EQ(1u, pq.children().size()); const auto& c = pq.children()[0].getBasic(); ASSERT_EQ(3u, selectClause.getSelectedVariables().size()); - ASSERT_EQ(Var{"?ql_textscore_x"}, selectClause.getSelectedVariables()[1]); + ASSERT_EQ(Var{"?ql_score_x_var_y"}, selectClause.getSelectedVariables()[1]); ASSERT_EQ(1u, c._triples.size()); ASSERT_EQ(10u, 
pq._limitOffset._limit); ASSERT_EQ(15u, pq._limitOffset._offset); ASSERT_EQ(size_t(2), pq._orderBy.size()); ASSERT_EQ(Var{"?y"}, pq._orderBy[0].variable_); ASSERT_FALSE(pq._orderBy[0].isDescending_); - ASSERT_EQ(Var{"?ql_textscore_x"}, pq._orderBy[1].variable_); + ASSERT_EQ(Var{"?ql_score_x_var_y"}, pq._orderBy[1].variable_); ASSERT_TRUE(pq._orderBy[1].isDescending_); ASSERT_TRUE(selectClause.distinct_); ASSERT_FALSE(selectClause.reduced_); diff --git a/test/VocabularyTest.cpp b/test/VocabularyTest.cpp index c00693d6a1..34c69e01f4 100644 --- a/test/VocabularyTest.cpp +++ b/test/VocabularyTest.cpp @@ -48,28 +48,32 @@ TEST(VocabularyTest, getIdRangeForFullTextPrefixTest) { v.createFromSet(s); uint64_t word0 = 0; - IdRange retVal; // Match exactly one - ASSERT_TRUE(v.getIdRangeForFullTextPrefix("wordA1*", &retVal)); - ASSERT_EQ(word0 + 1, retVal.first().get()); - ASSERT_EQ(word0 + 1, retVal.last().get()); + auto retVal = v.getIdRangeForFullTextPrefix("wordA1*"); + ASSERT_TRUE(retVal.has_value()); + ASSERT_EQ(word0 + 1, retVal.value().first().get()); + ASSERT_EQ(word0 + 1, retVal.value().last().get()); // Match all - ASSERT_TRUE(v.getIdRangeForFullTextPrefix("word*", &retVal)); - ASSERT_EQ(word0, retVal.first().get()); - ASSERT_EQ(word0 + 4, retVal.last().get()); + retVal = v.getIdRangeForFullTextPrefix("word*"); + ASSERT_TRUE(retVal.has_value()); + ASSERT_EQ(word0, retVal.value().first().get()); + ASSERT_EQ(word0 + 4, retVal.value().last().get()); // Match first two - ASSERT_TRUE(v.getIdRangeForFullTextPrefix("wordA*", &retVal)); - ASSERT_EQ(word0, retVal.first().get()); - ASSERT_EQ(word0 + 1, retVal.last().get()); + retVal = v.getIdRangeForFullTextPrefix("wordA*"); + ASSERT_TRUE(retVal.has_value()); + ASSERT_EQ(word0, retVal.value().first().get()); + ASSERT_EQ(word0 + 1, retVal.value().last().get()); // Match last three - ASSERT_TRUE(v.getIdRangeForFullTextPrefix("wordB*", &retVal)); - ASSERT_EQ(word0 + 2, retVal.first().get()); - ASSERT_EQ(word0 + 4, 
retVal.last().get()); + retVal = v.getIdRangeForFullTextPrefix("wordB*"); + ASSERT_TRUE(retVal.has_value()); + ASSERT_EQ(word0 + 2, retVal.value().first().get()); + ASSERT_EQ(word0 + 4, retVal.value().last().get()); - ASSERT_FALSE(v.getIdRangeForFullTextPrefix("foo*", &retVal)); + retVal = v.getIdRangeForFullTextPrefix("foo*"); + ASSERT_FALSE(retVal.has_value()); } TEST(VocabularyTest, readWriteTest) { diff --git a/test/engine/CMakeLists.txt b/test/engine/CMakeLists.txt index 2a9dfabac0..62a36c12ec 100644 --- a/test/engine/CMakeLists.txt +++ b/test/engine/CMakeLists.txt @@ -1,3 +1,5 @@ add_subdirectory(idTable) addLinkAndDiscoverTest(IndexScanTest engine) addLinkAndDiscoverTest(CartesianProductJoinTest engine) +addLinkAndDiscoverTest(TextIndexScanForWordTest engine) +addLinkAndDiscoverTest(TextIndexScanForEntityTest engine) diff --git a/test/engine/TextIndexScanForEntityTest.cpp b/test/engine/TextIndexScanForEntityTest.cpp new file mode 100644 index 0000000000..9ab86cb8d7 --- /dev/null +++ b/test/engine/TextIndexScanForEntityTest.cpp @@ -0,0 +1,155 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Nick Göckel + +#include + +#include "../IndexTestHelpers.h" +#include "../util/GTestHelpers.h" +#include "../util/IdTableHelpers.h" +#include "./TextIndexScanTestHelpers.h" +#include "engine/IndexScan.h" +#include "engine/TextIndexScanForEntity.h" +#include "parser/ParsedQuery.h" + +using namespace ad_utility::testing; +using ad_utility::source_location; +namespace h = textIndexScanTestHelpers; + +namespace { +std::string kg = + "

\"he failed the test\" .

\"testing can help\" .

" + "\"some other sentence\" .

\"the test on friday was really hard\" " + ". . ."; + +TEST(TextIndexScanForEntity, EntityScanBasic) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForEntity s1{qec, Variable{"?text"}, Variable{"?entityVar"}, + "test*"}; + TextIndexScanForEntity s2{qec, Variable{"?text2"}, Variable{"?entityVar2"}, + "test*"}; + ASSERT_EQ(s1.getResultWidth(), 3); + + auto result = s1.computeResultOnlyForTesting(); + ASSERT_EQ(result.width(), 3); + ASSERT_EQ(result.size(), 3); + + // NOTE: because of the way the graph above is constructed, the entities are + // texts + ASSERT_EQ("\"he failed the test\"", + h::getEntityFromResultTable(qec, result, 0)); + ASSERT_EQ("\"testing can help\"", + h::getEntityFromResultTable(qec, result, 1)); + ASSERT_EQ("\"the test on friday was really hard\"", + h::getEntityFromResultTable(qec, result, 2)); + + using enum ColumnIndexAndTypeInfo::UndefStatus; + VariableToColumnMap expectedVariables{ + {Variable{"?text2"}, {0, AlwaysDefined}}, + {Variable{"?entityVar2"}, {1, AlwaysDefined}}, + {Variable{"?ql_score_text2_var_entityVar2"}, {2, AlwaysDefined}}}; + EXPECT_THAT(s2.getExternallyVisibleVariableColumns(), + ::testing::UnorderedElementsAreArray(expectedVariables)); +} + +TEST(TextIndexScanForEntity, FixedEntityScan) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + string fixedEntity = "\"some other sentence\""; + TextIndexScanForEntity s3{qec, Variable{"?text3"}, fixedEntity, "sentence"}; + + auto result = s3.computeResultOnlyForTesting(); + ASSERT_EQ(s3.getResultWidth(), 2); + ASSERT_EQ(result.width(), 2); + ASSERT_EQ(result.size(), 1); + + using enum ColumnIndexAndTypeInfo::UndefStatus; + VariableToColumnMap expectedVariables = { + {Variable{"?text3"}, {0, AlwaysDefined}}, + {Variable{ + "?ql_score_text3_fixedEntity__34_some_32_other_32_sentence_34_"}, + {1, AlwaysDefined}}}; + EXPECT_THAT(s3.getExternallyVisibleVariableColumns(), + ::testing::UnorderedElementsAreArray(expectedVariables)); + + 
ASSERT_EQ(fixedEntity, h::getTextRecordFromResultTable(qec, result, 0)); + + fixedEntity = "\"he failed the test\""; + TextIndexScanForEntity s4{qec, Variable{"?text4"}, fixedEntity, "test*"}; + result = s4.computeResultOnlyForTesting(); + ASSERT_EQ(result.width(), 2); + ASSERT_EQ(result.size(), 1); + + ASSERT_EQ(fixedEntity, h::getTextRecordFromResultTable(qec, result, 0)); +} + +TEST(TextIndexScanForEntity, CacheKeys) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForEntity s1{qec, Variable{"?text"}, Variable{"?entityVar"}, + "test*"}; + TextIndexScanForEntity s2{qec, Variable{"?text2"}, Variable{"?entityVar2"}, + "test*"}; + // Different text vars, different entity vars, same word (both with prefix) + ASSERT_EQ(s1.getCacheKeyImpl(), s2.getCacheKeyImpl()); + + TextIndexScanForEntity s3{qec, Variable{"?text3"}, Variable{"?entityVar"}, + "test"}; + // Different text vars, same entity var, different words (one with, one + // without prefix) + ASSERT_NE(s1.getCacheKeyImpl(), s3.getCacheKeyImpl()); + + TextIndexScanForEntity s4{qec, Variable{"?text4"}, Variable{"?entityVar"}, + "sentence*"}; + // Different text vars, same entity var, different words (both with prefix) + ASSERT_NE(s1.getCacheKeyImpl(), s4.getCacheKeyImpl()); + + // fixed entity case + string fixedEntity = "\"some other sentence\""; + TextIndexScanForEntity s5{qec, Variable{"?text3"}, fixedEntity, "sentence"}; + // Same text var, different entities (one entity var, one fixed entity), same + // word + ASSERT_NE(s3.getCacheKeyImpl(), s5.getCacheKeyImpl()); + + TextIndexScanForEntity s6{qec, Variable{"?text6"}, fixedEntity, "sentence"}; + // Different text vars, same fixed entity, same word + ASSERT_EQ(s5.getCacheKeyImpl(), s6.getCacheKeyImpl()); + + string newFixedEntity = "\"he failed the test\""; + TextIndexScanForEntity s7{qec, Variable{"?text7"}, newFixedEntity, + "sentence"}; + // Different text vars, different fixed entities, same word + ASSERT_NE(s5.getCacheKeyImpl(), 
s7.getCacheKeyImpl()); + + TextIndexScanForEntity s8{qec, Variable{"?text7"}, newFixedEntity, + "sentences"}; + // Same text var, same fixed entitiy, different words + ASSERT_NE(s7.getCacheKeyImpl(), s8.getCacheKeyImpl()); +} + +TEST(TextIndexScanForEntity, KnownEmpty) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForEntity s1{qec, Variable{"?text"}, Variable{"?entityVar"}, + "nonExistentWord*"}; + ASSERT_TRUE(s1.knownEmptyResult()); + + string fixedEntity = "\"non existent entity\""; + AD_EXPECT_THROW_WITH_MESSAGE( + TextIndexScanForEntity(qec, Variable{"?text"}, fixedEntity, "test*"), + ::testing::ContainsRegex(absl::StrCat( + "The entity ", fixedEntity, + " is not part of the underlying knowledge graph and can therefore " + "not be used as the object of ql:contains-entity"))); + + TextIndexScanForEntity s2{qec, Variable{"?text"}, Variable{"?entityVar"}, + "test*"}; + ASSERT_TRUE(!s2.knownEmptyResult()); + + TextIndexScanForEntity s3{qec, Variable{"?text"}, Variable{"?entityVar"}, + "test"}; + ASSERT_TRUE(!s3.knownEmptyResult()); +} + +} // namespace diff --git a/test/engine/TextIndexScanForWordTest.cpp b/test/engine/TextIndexScanForWordTest.cpp new file mode 100644 index 0000000000..5f1741d955 --- /dev/null +++ b/test/engine/TextIndexScanForWordTest.cpp @@ -0,0 +1,127 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Nick Göckel + +#include +#include + +#include "../IndexTestHelpers.h" +#include "../util/GTestHelpers.h" +#include "../util/IdTableHelpers.h" +#include "./TextIndexScanTestHelpers.h" +#include "engine/IndexScan.h" +#include "engine/TextIndexScanForWord.h" +#include "parser/ParsedQuery.h" + +using namespace ad_utility::testing; +using ad_utility::source_location; +namespace h = textIndexScanTestHelpers; + +namespace { +std::string kg = + "

\"he failed the test\" .

\"testing can help\" .

" + "\"some other sentence\" .

\"the test on friday was really hard\" " + ". . ."; + +TEST(TextIndexScanForWord, WordScanPrefix) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForWord s1{qec, Variable{"?text1"}, "test*"}; + TextIndexScanForWord s2{qec, Variable{"?text2"}, "test*"}; + + ASSERT_EQ(s1.getResultWidth(), 2); + + auto result = s1.computeResultOnlyForTesting(); + ASSERT_EQ(result.width(), 2); + ASSERT_EQ(result.size(), 3); + s2.getExternallyVisibleVariableColumns(); + + using enum ColumnIndexAndTypeInfo::UndefStatus; + VariableToColumnMap expectedVariables{ + {Variable{"?text2"}, {0, AlwaysDefined}}, + {Variable{"?ql_matchingword_text2_test"}, {1, AlwaysDefined}}}; + EXPECT_THAT(s2.getExternallyVisibleVariableColumns(), + ::testing::UnorderedElementsAreArray(expectedVariables)); + + ASSERT_EQ(h::combineToString("\"he failed the test\"", "test"), + h::combineToString(h::getTextRecordFromResultTable(qec, result, 0), + h::getWordFromResultTable(qec, result, 0))); + ASSERT_EQ(h::combineToString("\"testing can help\"", "testing"), + h::combineToString(h::getTextRecordFromResultTable(qec, result, 1), + h::getWordFromResultTable(qec, result, 1))); + ASSERT_EQ( + h::combineToString("\"the test on friday was really hard\"", "test"), + h::combineToString(h::getTextRecordFromResultTable(qec, result, 2), + h::getWordFromResultTable(qec, result, 2))); +} + +TEST(TextIndexScanForWord, WordScanBasic) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForWord s1{qec, Variable{"?text1"}, "test"}; + + ASSERT_EQ(s1.getResultWidth(), 1); + + auto result = s1.computeResultOnlyForTesting(); + ASSERT_EQ(result.width(), 1); + ASSERT_EQ(result.size(), 2); + + ASSERT_EQ("\"he failed the test\"", + h::getTextRecordFromResultTable(qec, result, 0)); + ASSERT_EQ("\"the test on friday was really hard\"", + h::getTextRecordFromResultTable(qec, result, 1)); + + TextIndexScanForWord s2{qec, Variable{"?text1"}, "testing"}; + + ASSERT_EQ(s2.getResultWidth(), 1); + + 
result = s2.computeResultOnlyForTesting(); + ASSERT_EQ(result.width(), 1); + ASSERT_EQ(result.size(), 1); + + ASSERT_EQ("\"testing can help\"", + h::getTextRecordFromResultTable(qec, result, 0)); +} + +TEST(TextIndexScanForWord, CacheKey) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForWord s1{qec, Variable{"?text1"}, "test*"}; + TextIndexScanForWord s2{qec, Variable{"?text2"}, "test*"}; + // Different text variables, same word (both with prefix) + ASSERT_EQ(s1.getCacheKeyImpl(), s2.getCacheKeyImpl()); + + TextIndexScanForWord s3{qec, Variable{"?text1"}, "test"}; + // Same text variable, different words (one with, one without prefix) + ASSERT_NE(s1.getCacheKeyImpl(), s3.getCacheKeyImpl()); + + TextIndexScanForWord s4{qec, Variable{"?text1"}, "tests"}; + // Same text variable, different words (both without prefix) + ASSERT_NE(s3.getCacheKeyImpl(), s4.getCacheKeyImpl()); + + TextIndexScanForWord s5{qec, Variable{"?text2"}, "tests"}; + // Different text variables, different words (both without prefix) + ASSERT_NE(s3.getCacheKeyImpl(), s5.getCacheKeyImpl()); + // Different text variables, same words (both without prefix) + ASSERT_EQ(s4.getCacheKeyImpl(), s5.getCacheKeyImpl()); +} + +TEST(TextIndexScanForWord, KnownEmpty) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForWord s1{qec, Variable{"?text1"}, "nonExistentWord*"}; + ASSERT_TRUE(s1.knownEmptyResult()); + + TextIndexScanForWord s2{qec, Variable{"?text1"}, "nonExistentWord"}; + ASSERT_TRUE(s2.knownEmptyResult()); + + TextIndexScanForWord s3{qec, Variable{"?text1"}, "test"}; + ASSERT_TRUE(!s3.knownEmptyResult()); + + TextIndexScanForWord s4{qec, Variable{"?text1"}, "test*"}; + ASSERT_TRUE(!s4.knownEmptyResult()); + + TextIndexScanForWord s5{qec, Variable{"?text1"}, "testing"}; + ASSERT_TRUE(!s5.knownEmptyResult()); +} +} // namespace diff --git a/test/engine/TextIndexScanTestHelpers.h b/test/engine/TextIndexScanTestHelpers.h new file mode 100644 index 
0000000000..25ff7f3aaf --- /dev/null +++ b/test/engine/TextIndexScanTestHelpers.h @@ -0,0 +1,43 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Nick Göckel + +#pragma once + +namespace textIndexScanTestHelpers { +// NOTE: this function exploits a "lucky accident" that allows us to +// obtain the textRecord using idToOptionalString. +// TODO: Implement a more elegant/stable version +inline string getTextRecordFromResultTable(const QueryExecutionContext* qec, + const ResultTable& result, + const size_t& rowIndex) { + return qec->getIndex() + .idToOptionalString( + result.idTable().getColumn(0)[rowIndex].getVocabIndex()) + .value(); +} + +inline string getEntityFromResultTable(const QueryExecutionContext* qec, + const ResultTable& result, + const size_t& rowIndex) { + return qec->getIndex() + .idToOptionalString( + result.idTable().getColumn(1)[rowIndex].getVocabIndex()) + .value(); +} + +inline string getWordFromResultTable(const QueryExecutionContext* qec, + const ResultTable& result, + const size_t& rowIndex) { + return qec->getIndex() + .idToOptionalString( + result.idTable().getColumn(1)[rowIndex].getWordVocabIndex()) + .value(); +} + +inline string combineToString(const string& text, const string& word) { + std::stringstream ss; + ss << "Text: " << text << ", Word: " << word << std::endl; + return ss.str(); +} +} // namespace textIndexScanTestHelpers diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index 44c69e1864..4bff3a5137 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -123,7 +123,8 @@ Index makeTestIndex(const std::string& indexBasename, std::optional turtleInput, bool loadAllPermutations, bool usePatterns, bool usePrefixCompression, - ad_utility::MemorySize blocksizePermutations) { + ad_utility::MemorySize blocksizePermutations, + bool createTextIndex) { // Ignore the (irrelevant) log output of the index building and loading during // 
these tests. static std::ostringstream ignoreLogStream; @@ -155,6 +156,9 @@ Index makeTestIndex(const std::string& indexBasename, index.setPrefixCompression(usePrefixCompression); index.loadAllPermutations() = loadAllPermutations; index.createFromFile(inputFilename); + if (createTextIndex) { + index.addTextFromContextFile("", true); + } } if (!usePatterns || !loadAllPermutations) { // If we have no patterns, or only two permutations, then check the graceful @@ -172,6 +176,9 @@ Index makeTestIndex(const std::string& indexBasename, index.usePatterns() = usePatterns; index.loadAllPermutations() = loadAllPermutations; index.createFromOnDiskIndex(indexBasename); + if (createTextIndex) { + index.addTextFromOnDiskIndex(); + } ad_utility::setGlobalLoggingStream(&std::cout); if (usePatterns && loadAllPermutations) { @@ -184,7 +191,8 @@ Index makeTestIndex(const std::string& indexBasename, QueryExecutionContext* getQec(std::optional turtleInput, bool loadAllPermutations, bool usePatterns, bool usePrefixCompression, - ad_utility::MemorySize blocksizePermutations) { + ad_utility::MemorySize blocksizePermutations, + bool createTextIndex) { // Similar to `absl::Cleanup`. Calls the `callback_` in the destructor, but // the callback is stored as a `std::function`, which allows to store // different types of callbacks in the same wrapper type. @@ -230,20 +238,20 @@ QueryExecutionContext* getQec(std::optional turtleInput, std::string testIndexBasename = "_staticGlobalTestIndex" + std::to_string(contextMap.size()); contextMap.emplace( - key, - Context{TypeErasedCleanup{[testIndexBasename]() { - for (const std::string& indexFilename : - getAllIndexFilenames(testIndexBasename)) { - // Don't log when a file can't be deleted, - // because the logging might already be - // destroyed. 
- ad_utility::deleteFile(indexFilename, false); - } - }}, - std::make_unique(makeTestIndex( - testIndexBasename, turtleInput, loadAllPermutations, - usePatterns, usePrefixCompression, blocksizePermutations)), - std::make_unique()}); + key, Context{TypeErasedCleanup{[testIndexBasename]() { + for (const std::string& indexFilename : + getAllIndexFilenames(testIndexBasename)) { + // Don't log when a file can't be deleted, + // because the logging might already be + // destroyed. + ad_utility::deleteFile(indexFilename, false); + } + }}, + std::make_unique(makeTestIndex( + testIndexBasename, turtleInput, loadAllPermutations, + usePatterns, usePrefixCompression, + blocksizePermutations, createTextIndex)), + std::make_unique()}); } return contextMap.at(key).qec_.get(); }