Skip to content

Commit

Permalink
ql:contains-word now can show the score of the word match in the re…
Browse files Browse the repository at this point in the history
…spective text (#1397)

The fulltext index of QLever has always been able to associate the occurrence of a word in a text with a score.
This PR adds the functionality to actually retrieve this score and to use it in the remainder of the query.
Currently, the score is bound to a variable whose name is automatically determined from the involved literals and variables. The easiest way to get the names of these variables is to use `SELECT *` or to look at the runtime information tree.
  • Loading branch information
Flixtastic authored Dec 16, 2024
1 parent 27f4799 commit a97905e
Show file tree
Hide file tree
Showing 20 changed files with 400 additions and 98 deletions.
33 changes: 24 additions & 9 deletions e2e/scientists_queries.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,31 +55,43 @@ queries:
?t ql:contains-word "RElaT* phySIKalische rela*"
}
checks:
- num_cols: 5
- selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_matchingword_t_relat", "?ql_matchingword_t_rela" ]
- num_cols: 8
- selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_score_prefix_t_RElaT", "?ql_matchingword_t_relat", "?ql_score_word_t_phySIKalische", "?ql_score_prefix_t_rela", "?ql_matchingword_t_rela" ]
- contains_row:
- "<Albert_Einstein>"
- null
- null
- null
- "relationship"
- null
- null
- "relationship"
- contains_row:
- "<Albert_Einstein>"
- null
- null
- null
- "relationship"
- null
- null
- "relativity"
- contains_row:
- "<Albert_Einstein>"
- null
- null
- null
- "relativity"
- null
- null
- "relationship"
- contains_row:
- "<Albert_Einstein>"
- null
- null
- null
- "relativity"
- null
- null
- "relativity"

- query: algo-star-female-scientists
Expand Down Expand Up @@ -151,7 +163,7 @@ queries:
}
TEXTLIMIT 2
checks:
- num_cols: 7
- num_cols: 9
- num_rows: 18

- query: algor-star-female-born-before-1940
Expand Down Expand Up @@ -192,7 +204,7 @@ queries:
}
ORDER BY DESC(?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_)
checks:
- num_cols: 5
- num_cols: 6
- num_rows: 7
- contains_row:
- "<Ada_Lovelace>"
Expand All @@ -202,6 +214,7 @@ queries:
Charles Babbage, also known as' the father of computers', and in
particular, Babbage's work on the Analytical Engine."
- null
- null
- "relationship"
- order_numeric: {"dir": "DESC",
"var" : "?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_"}
Expand All @@ -219,7 +232,7 @@ queries:
ORDER BY DESC(?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_)
TEXTLIMIT 2
checks:
- num_cols: 5
- num_cols: 6
- num_rows: 3
- contains_row:
- "<Ada_Lovelace>"
Expand All @@ -229,6 +242,7 @@ queries:
Charles Babbage, also known as' the father of computers', and in
particular, Babbage's work on the Analytical Engine."
- null
- null
- "relationship"
- order_numeric: {"dir": "DESC",
"var" : "?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_"}
Expand All @@ -246,7 +260,7 @@ queries:
}
TEXTLIMIT 1
checks:
- num_cols: 6
- num_cols: 7
- num_rows: 2
- contains_row:
- "<Ada_Lovelace>"
Expand All @@ -255,6 +269,7 @@ queries:
with Somerville to visit Babbage as often as she could."
- null
- null
- null
- "relationship"


Expand Down Expand Up @@ -1391,10 +1406,10 @@ queries:
?t ql:contains-word "algo* herm* primary"
}
checks:
- num_cols: 5
- num_cols: 8
- num_rows: 1
- selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_matchingword_t_algo", "?ql_matchingword_t_herm" ]
- contains_row: [ "<Grete_Hermann>",null,"Hermann's algorithm for primary decomposition is still in use now.","algorithm","hermann" ]
- selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_score_prefix_t_algo", "?ql_matchingword_t_algo", "?ql_score_prefix_t_herm", "?ql_matchingword_t_herm", "?ql_score_word_t_primary" ]
- contains_row: [ "<Grete_Hermann>",null,"Hermann's algorithm for primary decomposition is still in use now.",null,"algorithm",null,"hermann",null ]


- query : select_asterisk_regex-lastname-stein
Expand Down
4 changes: 2 additions & 2 deletions src/engine/QueryPlanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1002,14 +1002,14 @@ QueryPlanner::SubtreePlan QueryPlanner::getTextLeafPlan(
: *(node._variables.begin());
plan = makeSubtreePlan<TextIndexScanForEntity>(_qec, cvar, evar, word);
textLimits[cvar].entityVars_.push_back(evar);
textLimits[cvar].scoreVars_.push_back(cvar.getScoreVariable(evar));
textLimits[cvar].scoreVars_.push_back(cvar.getEntityScoreVariable(evar));
} else {
// Fixed entity case
AD_CORRECTNESS_CHECK(node._variables.size() == 1);
plan = makeSubtreePlan<TextIndexScanForEntity>(
_qec, cvar, node.triple_.o_.toString(), word);
textLimits[cvar].scoreVars_.push_back(
cvar.getScoreVariable(node.triple_.o_.toString()));
cvar.getEntityScoreVariable(node.triple_.o_.toString()));
}
} else {
plan = makeSubtreePlan<TextIndexScanForWord>(_qec, cvar, word);
Expand Down
4 changes: 2 additions & 2 deletions src/engine/TextIndexScanForEntity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,10 @@ VariableToColumnMap TextIndexScanForEntity::computeVariableToColumnMap() const {
};
addDefinedVar(textRecordVar_);
if (hasFixedEntity()) {
addDefinedVar(textRecordVar_.getScoreVariable(fixedEntity()));
addDefinedVar(textRecordVar_.getEntityScoreVariable(fixedEntity()));
} else {
addDefinedVar(entityVariable());
addDefinedVar(textRecordVar_.getScoreVariable(entityVariable()));
addDefinedVar(textRecordVar_.getEntityScoreVariable(entityVariable()));
}
return vcmap;
}
Expand Down
14 changes: 7 additions & 7 deletions src/engine/TextIndexScanForWord.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,12 @@ ProtoResult TextIndexScanForWord::computeResult(
IdTable idTable = getExecutionContext()->getIndex().getWordPostingsForTerm(
word_, getExecutionContext()->getAllocator());

// This filters out the word column. When the search word is a prefix, this
// column shows the word the prefix got extended to.
if (!isPrefix_) {
IdTable smallIdTable{getExecutionContext()->getAllocator()};
smallIdTable.setNumColumns(1);
smallIdTable.resize(idTable.numRows());
ql::ranges::copy(idTable.getColumn(0), smallIdTable.getColumn(0).begin());

return {std::move(smallIdTable), resultSortedOn(), LocalVocab{}};
using CI = ColumnIndex;
idTable.setColumnSubset(std::array{CI{0}, CI{2}});
return {std::move(idTable), resultSortedOn(), LocalVocab{}};
}

// Add details to the runtimeInfo. This has no effect on the result.
Expand All @@ -46,12 +45,13 @@ VariableToColumnMap TextIndexScanForWord::computeVariableToColumnMap() const {
addDefinedVar(textRecordVar_.getMatchingWordVariable(
std::string_view(word_).substr(0, word_.size() - 1)));
}
addDefinedVar(textRecordVar_.getWordScoreVariable(word_, isPrefix_));
return vcmap;
}

// _____________________________________________________________________________
size_t TextIndexScanForWord::getResultWidth() const {
return 1 + (isPrefix_ ? 1 : 0);
return 2 + (isPrefix_ ? 1 : 0);
}

// _____________________________________________________________________________
Expand Down
7 changes: 5 additions & 2 deletions src/index/FTSAlgorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,21 @@
// _____________________________________________________________________________
IdTable FTSAlgorithms::filterByRange(const IdRange<WordVocabIndex>& idRange,
const IdTable& idTablePreFilter) {
AD_CONTRACT_CHECK(idTablePreFilter.numColumns() == 2);
AD_CONTRACT_CHECK(idTablePreFilter.numColumns() == 3);
LOG(DEBUG) << "Filtering " << idTablePreFilter.getColumn(0).size()
<< " elements by ID range...\n";

IdTable idTableResult{idTablePreFilter.getAllocator()};
idTableResult.setNumColumns(2);
idTableResult.setNumColumns(3);
idTableResult.resize(idTablePreFilter.getColumn(0).size());

decltype(auto) resultCidColumn = idTableResult.getColumn(0);
decltype(auto) resultWidColumn = idTableResult.getColumn(1);
decltype(auto) resultSidColumn = idTableResult.getColumn(2);
size_t nofResultElements = 0;
decltype(auto) preFilterCidColumn = idTablePreFilter.getColumn(0);
decltype(auto) preFilterWidColumn = idTablePreFilter.getColumn(1);
decltype(auto) preFilterSidColumn = idTablePreFilter.getColumn(2);
// TODO<C++23> Use views::zip.
for (size_t i = 0; i < preFilterWidColumn.size(); ++i) {
// TODO<joka921> proper Ids for the text stuff.
Expand All @@ -36,6 +38,7 @@ IdTable FTSAlgorithms::filterByRange(const IdRange<WordVocabIndex>& idRange,
preFilterWidColumn[i].getWordVocabIndex() <= idRange.last()) {
resultCidColumn[nofResultElements] = preFilterCidColumn[i];
resultWidColumn[nofResultElements] = preFilterWidColumn[i];
resultSidColumn[nofResultElements] = preFilterSidColumn[i];
nofResultElements++;
}
}
Expand Down
5 changes: 5 additions & 0 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,11 @@ size_t Index::getNofEntityPostings() const {
return pimpl_->getNofEntityPostings();
}

// ____________________________________________________________________________
// Forwards to the implementation object (`pimpl_`) and returns its value for
// the number of non-literals in the text index.
size_t Index::getNofNonLiteralsInTextIndex() const {
return pimpl_->getNofNonLiteralsInTextIndex();
}

// ____________________________________________________________________________
Index::NumNormalAndInternal Index::numDistinctSubjects() const {
return pimpl_->numDistinctSubjects();
Expand Down
1 change: 1 addition & 0 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ class Index {
size_t getNofTextRecords() const;
size_t getNofWordPostings() const;
size_t getNofEntityPostings() const;
size_t getNofNonLiteralsInTextIndex() const;

NumNormalAndInternal numDistinctSubjects() const;
NumNormalAndInternal numDistinctObjects() const;
Expand Down
25 changes: 20 additions & 5 deletions src/index/IndexImpl.Text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ cppcoro::generator<ContextFileParser::Line> IndexImpl::wordsInTextRecords(
if (!isLiteral(text)) {
continue;
}
ContextFileParser::Line entityLine{text, true, contextId, 1};
ContextFileParser::Line entityLine{text, true, contextId, 1, true};
co_yield entityLine;
std::string_view textView = text;
textView = textView.substr(0, textView.rfind('"'));
Expand Down Expand Up @@ -235,10 +235,12 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile,
ad_utility::HashMap<WordIndex, Score> wordsInContext;
ad_utility::HashMap<Id, Score> entitiesInContext;
auto currentContext = TextRecordIndex::make(0);
// The nofContexts can be misleading since it also counts empty contexts.
size_t nofContexts = 0;
size_t nofWordPostings = 0;
size_t nofEntityPostings = 0;
size_t entityNotFoundErrorMsgCount = 0;
size_t nofLiterals = 0;

for (auto line : wordsInTextRecords(contextFile, addWordsFromLiterals)) {
if (line._contextId != currentContext) {
Expand All @@ -258,6 +260,9 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile,
// Note that `entitiesInContext` is a HashMap, so the `Id`s don't have
// to be contiguous.
entitiesInContext[Id::makeFromVocabIndex(eid)] += line._score;
if (line._isLiteralEntity) {
++nofLiterals;
}
} else {
if (entityNotFoundErrorMsgCount < 20) {
LOG(WARN) << "Entity from text not in KB: " << line._word << '\n';
Expand Down Expand Up @@ -294,6 +299,10 @@ void IndexImpl::processWordsForInvertedLists(const string& contextFile,
textMeta_.setNofTextRecords(nofContexts);
textMeta_.setNofWordPostings(nofWordPostings);
textMeta_.setNofEntityPostings(nofEntityPostings);
nofNonLiteralsInTextIndex_ = nofContexts - nofLiterals;
configurationJson_["num-non-literals-text-index"] =
nofNonLiteralsInTextIndex_;
writeConfiguration();

writer.finish();
LOG(TRACE) << "END IndexImpl::passContextFileIntoVector" << std::endl;
Expand Down Expand Up @@ -415,7 +424,7 @@ ContextListMetaData IndexImpl::writePostings(ad_utility::File& out,

size_t n = 0;

WordToCodeMap wordCodeMap;
WordCodeMap wordCodeMap;
WordCodebook wordCodebook;
ScoreCodeMap scoreCodeMap;
ScoreCodebook scoreCodebook;
Expand Down Expand Up @@ -646,10 +655,11 @@ size_t IndexImpl::writeList(Numeric* data, size_t nofElements,

// _____________________________________________________________________________
void IndexImpl::createCodebooks(const vector<IndexImpl::Posting>& postings,
IndexImpl::WordToCodeMap& wordCodemap,
IndexImpl::WordCodeMap& wordCodemap,
IndexImpl::WordCodebook& wordCodebook,
IndexImpl::ScoreCodeMap& scoreCodemap,
IndexImpl::ScoreCodebook& scoreCodebook) const {
// There should be a more efficient way to do this (Felix Meisen)
ad_utility::HashMap<WordIndex, size_t> wfMap;
ad_utility::HashMap<Score, size_t> sfMap;
for (const auto& p : postings) {
Expand Down Expand Up @@ -718,7 +728,7 @@ std::string_view IndexImpl::wordIdToString(WordIndex wordIndex) const {
IdTable IndexImpl::readWordCl(
const TextBlockMetaData& tbmd,
const ad_utility::AllocatorWithLimit<Id>& allocator) const {
IdTable idTable{2, allocator};
IdTable idTable{3, allocator};
vector<TextRecordIndex> cids = readGapComprList<TextRecordIndex>(
tbmd._cl._nofElements, tbmd._cl._startContextlist,
static_cast<size_t>(tbmd._cl._startWordlist - tbmd._cl._startContextlist),
Expand All @@ -734,6 +744,11 @@ IdTable IndexImpl::readWordCl(
idTable.getColumn(1).begin(), [](WordIndex id) {
return Id::makeFromWordVocabIndex(WordVocabIndex::make(id));
});
std::ranges::transform(
readFreqComprList<Score>(tbmd._cl._nofElements, tbmd._cl._startScorelist,
static_cast<size_t>(tbmd._cl._lastByte + 1 -
tbmd._cl._startScorelist)),
idTable.getColumn(2).begin(), &Id::makeFromInt);
return idTable;
}

Expand Down Expand Up @@ -772,7 +787,7 @@ IdTable IndexImpl::getWordPostingsForTerm(
const ad_utility::AllocatorWithLimit<Id>& allocator) const {
LOG(DEBUG) << "Getting word postings for term: " << term << '\n';
IdTable idTable{allocator};
idTable.setNumColumns(term.ends_with('*') ? 2 : 1);
idTable.setNumColumns(term.ends_with('*') ? 3 : 2);
auto optionalTbmd = getTextBlockMetadataForWordOrPrefix(term);
if (!optionalTbmd.has_value()) {
return idTable;
Expand Down
1 change: 1 addition & 0 deletions src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1128,6 +1128,7 @@ void IndexImpl::readConfiguration() {
loadDataMember("num-subjects", numSubjects_, NumNormalAndInternal{});
loadDataMember("num-objects", numObjects_, NumNormalAndInternal{});
loadDataMember("num-triples", numTriples_, NumNormalAndInternal{});
loadDataMember("num-non-literals-text-index", nofNonLiteralsInTextIndex_, 0);

// Initialize BlankNodeManager
uint64_t numBlankNodesTotal;
Expand Down
15 changes: 13 additions & 2 deletions src/index/IndexImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,11 @@ class IndexImpl {
NumNormalAndInternal numTriples_;
string indexId_;

// Keeps track of the number of non-literal contexts in the index. This is
// used in the test retrieval of the texts. This only works reliably if the
// wordsFile.tsv starts with contextId 1 and is continuous.
size_t nofNonLiteralsInTextIndex_;

// Global static pointers to the currently active index and comparator.
// Those are used to compare LocalVocab entries with each other as well as
// with Vocab entries.
Expand Down Expand Up @@ -424,6 +429,9 @@ class IndexImpl {
// Returns the number of entity postings as reported by the text metadata
// (`textMeta_`).
size_t getNofEntityPostings() const {
return textMeta_.getNofEntityPostings();
}
// Returns the value currently stored in the member
// `nofNonLiteralsInTextIndex_`.
size_t getNofNonLiteralsInTextIndex() const {
return nofNonLiteralsInTextIndex_;
}

// Returns the result of `SPO().isLoaded()`, i.e. the loaded state of the SPO
// permutation is used as the indicator that all permutations are available.
bool hasAllPermutations() const { return SPO().isLoaded(); }

Expand Down Expand Up @@ -624,14 +632,17 @@ class IndexImpl {
ad_utility::File& file) const;

// TODO<joka921> understand what the "codes" are, are they better just ints?
typedef ad_utility::HashMap<WordIndex, CompressionCode> WordToCodeMap;
// After using createCodebooks on these types, the lowest codes refer to the
// most frequent WordIndex/Score. The maps are mapping those codes to their
// respective frequency.
typedef ad_utility::HashMap<WordIndex, CompressionCode> WordCodeMap;
typedef ad_utility::HashMap<Score, Score> ScoreCodeMap;
typedef vector<CompressionCode> WordCodebook;
typedef vector<Score> ScoreCodebook;

//! Creates codebooks for lists that are supposed to be entropy encoded.
void createCodebooks(const vector<Posting>& postings,
WordToCodeMap& wordCodemap, WordCodebook& wordCodebook,
WordCodeMap& wordCodemap, WordCodebook& wordCodebook,
ScoreCodeMap& scoreCodemap,
ScoreCodebook& scoreCodebook) const;

Expand Down
1 change: 1 addition & 0 deletions src/parser/ContextFileParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class ContextFileParser {
bool _isEntity;
TextRecordIndex _contextId;
Score _score;
bool _isLiteralEntity = false;
};

explicit ContextFileParser(const string& contextFile,
Expand Down
Loading

0 comments on commit a97905e

Please sign in to comment.