Skip to content

Commit

Permalink
Completely refactor the fulltext operations (ad-freiburg#1093)
Browse files Browse the repository at this point in the history
As of this commit, the fulltext index (triggered by `ql:contains-word` and `ql:contains-entity`) uses two basic operations:
1. `TextIndexScanForWord`: For a given word or prefix, return all text records that contain the word (possibly together with the matched word in the case of a prefix, and the score of the match).
2. `TextIndexScanForEntity`: For a given word or prefix, return a superset of all pairs of `(text, entity)` where the entity is contained in the text according to `ql:contains-entity` and the text contains the `word`. For technical reasons, this is a superset: we always have to scan the complete block from the half-inverted index, which might belong to a shorter prefix.

The general processing is then as follows:
* For each word or prefix that appears as part of the object of a `ql:contains-word` triple, a `TextIndexScanForWord` is created.
* For each entity or variable that appears as the object of a `ql:contains-entity` triple, a `TextIndexScanForEntity` is created.
* The rest of the query processing is handled by the "ordinary" query planner using the normal operations like JOIN that are also used to process standard SPARQL queries.

This is much cleaner than the old `TextOperationWith[out]Filter` operations, which combined the functionality of the above scan operations with JOIN operations. The old approach led to a lot of code duplication (the code for a join of two tables was duplicated for the fulltext module). The new approach also makes queries easier to optimize and to reason about, because the runtime information trees become much clearer if the scans and joins are represented separately.
  • Loading branch information
NickG-1 authored Jan 18, 2024
1 parent f7c2c32 commit 8f9b13a
Show file tree
Hide file tree
Showing 33 changed files with 1,624 additions and 863 deletions.
106 changes: 73 additions & 33 deletions e2e/scientists_queries.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@ queries:
- query: relativ-star-scientists
type: text
sparql: |
SELECT ?x ?t ?ql_textscore_t WHERE {
SELECT ?x ?t ?ql_score_t_var_x WHERE {
?x <is-a> <Scientist> .
?t ql:contains-entity ?x .
?t ql:contains-word "relati*"
}
ORDER BY DESC(?ql_textscore_t)
ORDER BY DESC(?ql_score_t_var_x)
checks:
- num_cols: 3
- num_rows: 4285
- selected: [ "?x", "?t", "?ql_textscore_t"]
- selected: [ "?x", "?t", "?ql_score_t_var_x"]
- contains_row:
- "<Albert_Einstein>"
- "He realized, however, that the principle of relativity could also be extended
Expand All @@ -23,30 +23,27 @@ queries:
- null
- contains_row: [ "<Albert_Einstein>", null, null ] # null cells are ignored
- contains_row: [ "<Luís_Lindley_Cintra>", null, null ] # Test Unicode
- order_numeric: {"dir" : "DESC", "var": "?ql_textscore_t"}
- order_numeric: {"dir" : "DESC", "var": "?ql_score_t_var_x"}


- query: relativ-star-scientists-from-ulm # should use TextOperationWithFilter
- query: relativ-star-scientists-from-ulm
type: text
sparql: |
SELECT ?x ?t ?ql_textscore_t WHERE {
SELECT ?x ?t WHERE {
?x <is-a> <Scientist> .
?x <Place_of_birth> <Ulm> .
?t ql:contains-entity ?x .
?t ql:contains-word "relati*"
}
ORDER BY DESC(?ql_textscore_t)
TEXTLIMIT 1
checks:
- num_cols: 3
- num_rows: 1
- selected: [ "?x", "?t", "?ql_textscore_t" ]
- num_cols: 2
- num_rows: 172
- selected: [ "?x", "?t"]
- contains_row:
- "<Albert_Einstein>"
- "He realized, however, that the principle of relativity could also be extended
to gravitational fields, and with his subsequent theory of gravitation in 1916,
he published a paper on general relativity."
- null

- query: relat-star-Physikalische-real-star-scientists-from-ulm
type: text
Expand All @@ -55,11 +52,11 @@ queries:
?x <is-a> <Scientist> .
?x <Place_of_birth> <Ulm> .
?t ql:contains-entity ?x .
?t ql:contains-word "relat* Physikalische rela*"
?t ql:contains-word "RElaT* phySIKalische rela*"
}
checks:
- num_cols: 5
- selected: [ "?x", "?ql_textscore_t", "?t", "?ql_matchingword_t_relat", "?ql_matchingword_t_rela" ]
- selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_matchingword_t_relat", "?ql_matchingword_t_rela" ]
- contains_row:
- "<Albert_Einstein>"
- null
Expand Down Expand Up @@ -88,73 +85,116 @@ queries:
- query: algo-star-female-scientists
type: text
sparql: |
SELECT ?x ?ql_textscore_t WHERE {
SELECT ?x ?ql_score_t_var_x WHERE {
?x <is-a> <Scientist> .
?x <Gender> <Female> .
?t ql:contains-entity ?x .
?t ql:contains-word "algo*"
}
ORDER BY DESC(?ql_textscore_t)
ORDER BY DESC(?ql_score_t_var_x)
checks:
- num_cols: 2
- num_rows: 27
- selected: [ "?x", "?ql_textscore_t" ]
- selected: [ "?x", "?ql_score_t_var_x" ]
- contains_row: [ "<Grete_Hermann>", null ]
- order_numeric: {"dir": "DESC", "var" : "?ql_textscore_t"}
- order_numeric: {"dir": "DESC", "var" : "?ql_score_t_var_x"}


- query: algor-start-female-born-before-1940
- query: algor-star-female-born-before-1940
type: text
sparql: |
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT ?x ?date ?t ?ql_textscore_t ?ql_matchingword_t_algor WHERE {
SELECT ?x ?date ?t ?ql_matchingword_t_algor WHERE {
?x <is-a> <Scientist> .
?x <Date_of_birth> ?date .
?x <Gender> <Female> .
?t ql:contains-entity ?x .
?t ql:contains-word "algor*" .
FILTER (?date < "1940-01-01"^^xsd:date)
}
ORDER BY DESC(?ql_textscore_t)
checks:
- num_cols: 5
- num_cols: 4
- num_rows: 4
- contains_row:
- "<Grete_Hermann>"
- "1901-03-02"
- "Hermann's algorithm for primary decomposition is still in use now."
- null
- "algorithm"
- contains_row:
- "<Ada_Lovelace>"
- "1815-12-10"
- "Her notes on the engine include what is recognised as the first algorithm intended to be carried out by a machine."
- null
- "algorithm"
- order_numeric: {"dir": "DESC", "var" : "?ql_textscore_t"}

- query: algorithm-hermann-start-female-born-before-1940
- query: algor-star-female-fixedEntity-ada-ordered
type: text
sparql: |
SELECT * WHERE {
?scientist <is-a> <Scientist> .
?scientist <Gender> <Female> .
?text ql:contains-entity ?scientist .
?text ql:contains-entity <Ada_Lovelace> .
?text ql:contains-word "rela*" .
}
ORDER BY DESC(?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_)
checks:
- num_cols: 5
- num_rows: 7
- contains_row:
- "<Ada_Lovelace>"
- null
- "As a teenager, her mathematical talents led her to an ongoing
working relationship and friendship with fellow British mathematician
Charles Babbage, also known as' the father of computers', and in
particular, Babbage's work on the Analytical Engine."
- null
- "relationship"
- order_numeric: {"dir": "DESC",
"var" : "?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_"}

- query: algor-star-female-fixedEntity-ada-fixed-Entity-mary
type: text
sparql: |
SELECT * WHERE {
?scientist <is-a> <Scientist> .
?scientist <Gender> <Female> .
?text ql:contains-entity ?scientist .
?text ql:contains-entity <Ada_Lovelace> .
?text ql:contains-entity <Mary_Somerville> .
?text ql:contains-word "rela*" .
}
checks:
- num_cols: 6
- num_rows: 2
- contains_row:
- "<Ada_Lovelace>"
- null
- "She became fascinated with the machine and used her relationship
with Somerville to visit Babbage as often as she could."
- null
- null
- "relationship"


- query: algorithm-hermann-star-female-born-before-1940
type: text
sparql: |
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
SELECT ?x ?date ?t ?ql_textscore_t WHERE {
SELECT ?x ?date ?t WHERE {
?x <is-a> <Scientist> .
?x <Date_of_birth> ?date .
?x <Gender> <Female> .
?t ql:contains-entity ?x .
?t ql:contains-word "algorithm hermann" .
FILTER (?date < "1940-01-01"^^xsd:date)
}
ORDER BY DESC(?ql_textscore_t)
checks:
- num_cols: 4
- num_cols: 3
- num_rows: 1
- contains_row:
- "<Grete_Hermann>"
- "1901-03-02"
- "Hermann's algorithm for primary decomposition is still in use now."
- null
- order_numeric: {"dir": "DESC", "var" : "?ql_textscore_t"}

- query: people-born-in-1901
type: no-text
Expand Down Expand Up @@ -1239,11 +1279,11 @@ queries:
?x <Gender> <Female> .
?t ql:contains-entity ?x .
?t ql:contains-word "algo* herm* primary"
} TEXTLIMIT 1
}
checks:
- num_cols: 5
- num_rows: 1
- selected: [ "?x", "?ql_textscore_t", "?t", "?ql_matchingword_t_algo", "?ql_matchingword_t_herm" ]
- selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_matchingword_t_algo", "?ql_matchingword_t_herm" ]
- contains_row: [ "<Grete_Hermann>",null,"Hermann's algorithm for primary decomposition is still in use now.","algorithm","hermann" ]


Expand Down
2 changes: 1 addition & 1 deletion src/engine/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@ add_library(engine
Union.cpp MultiColumnJoin.cpp TransitivePath.cpp Service.cpp
Values.cpp Bind.cpp Minus.cpp RuntimeInformation.cpp CheckUsePatternTrick.cpp
VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp
CartesianProductJoin.cpp
CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp
idTable/CompressedExternalIdTable.h)
qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams)
10 changes: 10 additions & 0 deletions src/engine/QueryExecutionTree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
#include "engine/OrderBy.h"
#include "engine/Service.h"
#include "engine/Sort.h"
#include "engine/TextIndexScanForEntity.h"
#include "engine/TextIndexScanForWord.h"
#include "engine/TextOperationWithFilter.h"
#include "engine/TextOperationWithoutFilter.h"
#include "engine/TransitivePath.h"
Expand Down Expand Up @@ -176,6 +178,10 @@ void QueryExecutionTree::setOperation(std::shared_ptr<Op> operation) {
type_ = TEXT_WITH_FILTER;
} else if constexpr (std::is_same_v<Op, TextOperationWithoutFilter>) {
type_ = TEXT_WITHOUT_FILTER;
} else if constexpr (std::is_same_v<Op, TextIndexScanForWord>) {
type_ = TEXT_INDEX_SCAN_FOR_WORD;
} else if constexpr (std::is_same_v<Op, TextIndexScanForEntity>) {
type_ = TEXT_INDEX_SCAN_FOR_ENTITY;
} else if constexpr (std::is_same_v<Op, CountAvailablePredicates>) {
type_ = COUNT_AVAILABLE_PREDICATES;
} else if constexpr (std::is_same_v<Op, Minus>) {
Expand Down Expand Up @@ -217,6 +223,10 @@ template void QueryExecutionTree::setOperation(
std::shared_ptr<TextOperationWithFilter>);
template void QueryExecutionTree::setOperation(
std::shared_ptr<TextOperationWithoutFilter>);
template void QueryExecutionTree::setOperation(
std::shared_ptr<TextIndexScanForWord>);
template void QueryExecutionTree::setOperation(
std::shared_ptr<TextIndexScanForEntity>);
template void QueryExecutionTree::setOperation(
std::shared_ptr<CountAvailablePredicates>);
template void QueryExecutionTree::setOperation(std::shared_ptr<Minus>);
Expand Down
2 changes: 2 additions & 0 deletions src/engine/QueryExecutionTree.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ class QueryExecutionTree {
DISTINCT,
TEXT_WITHOUT_FILTER,
TEXT_WITH_FILTER,
TEXT_INDEX_SCAN_FOR_WORD,
TEXT_INDEX_SCAN_FOR_ENTITY,
OPTIONAL_JOIN,
COUNT_AVAILABLE_PREDICATES,
GROUP_BY,
Expand Down
Loading

0 comments on commit 8f9b13a

Please sign in to comment.