Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a VALUES clause to the query of a SERVICE clause to simplify the execution #1341

Merged
merged 13 commits into from
Jun 5, 2024
30 changes: 30 additions & 0 deletions src/engine/QueryPlanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1669,6 +1669,11 @@ std::vector<QueryPlanner::SubtreePlan> QueryPlanner::createJoinCandidates(
candidates.push_back(std::move(opt.value()));
}

if (auto opt = createJoinWithService(a, b, jcs)) {
candidates.push_back(std::move(opt.value()));
return candidates;
}

// Test if one of `a` or `b` is a transitive path to which we can bind the
// other one.
if (auto opt = createJoinWithTransitivePath(a, b, jcs)) {
Expand Down Expand Up @@ -1769,6 +1774,31 @@ auto QueryPlanner::createJoinWithHasPredicateScan(
return plan;
}

// _____________________________________________________________________
auto QueryPlanner::createJoinWithService(
SubtreePlan a, SubtreePlan b,
const std::vector<std::array<ColumnIndex, 2>>& jcs)
-> std::optional<SubtreePlan> {
auto aRootOp = std::dynamic_pointer_cast<Service>(a._qet->getRootOperation());
auto bRootOp = std::dynamic_pointer_cast<Service>(b._qet->getRootOperation());

// Exactly one of the two Operations can be a service.
if (aRootOp == bRootOp) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (aRootOp == bRootOp) {
if (static_cast<bool>(aRootOp) == static_cast<bool>(bRootOp)) {

The other one is very fishy (expects that you don't get passed the same Service twice).

return std::nullopt;
}

auto service = aRootOp ? aRootOp : bRootOp;
auto sibling = bRootOp ? a : b;

service->setSiblingTree(sibling._qet);

SubtreePlan plan = makeSubtreePlan<Join>(
service->getExecutionContext(), a._qet, b._qet, jcs[0][0], jcs[0][1]);
mergeSubtreePlanIds(plan, a, b);
joka921 marked this conversation as resolved.
Show resolved Hide resolved

return plan;
}

// _____________________________________________________________________
void QueryPlanner::QueryGraph::setupGraph(
const std::vector<SubtreePlan>& leafOperations) {
Expand Down
4 changes: 4 additions & 0 deletions src/engine/QueryPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,10 @@ class QueryPlanner {
SubtreePlan a, SubtreePlan b,
const std::vector<std::array<ColumnIndex, 2>>& jcs);

// Special-case join for when `a` or `b` has a `Service` as its root
// operation: the other subtree is handed to that `Service` via
// `setSiblingTree`, and a `Join` of `a` and `b` on the first pair of join
// columns (`jcs[0]`) is returned. Returns `std::nullopt` when this special
// case does not apply (e.g. neither root operation is a `Service`).
[[nodiscard]] static std::optional<SubtreePlan> createJoinWithService(
SubtreePlan a, SubtreePlan b,
const std::vector<std::array<ColumnIndex, 2>>& jcs);

[[nodiscard]] vector<SubtreePlan> getOrderByRow(
const ParsedQuery& pq,
const std::vector<std::vector<SubtreePlan>>& dpTab) const;
Expand Down
95 changes: 59 additions & 36 deletions src/engine/Service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,14 @@
#include "engine/Service.h"

#include <absl/strings/str_cat.h>
#include <absl/strings/str_join.h>
#include <absl/strings/str_split.h>

#include "engine/CallFixedSize.h"
#include "engine/ExportQueryExecutionTrees.h"
#include "engine/Values.h"
#include "engine/VariableToColumnMap.h"
#include "global/RuntimeParameters.h"
#include "parser/TokenizerCtre.h"
#include "parser/TurtleParser.h"
#include "util/Exception.h"
Expand Down Expand Up @@ -95,42 +97,11 @@ ResultTable Service::computeResult() {
serviceIriString.remove_suffix(1);
ad_utility::httpUtils::Url serviceUrl{serviceIriString};

if (siblingTree_ != nullptr) {
// Get the result of the siblingTree, to (potentially)
// reduce complexity of the SERVICE query.
auto siblingResult = siblingTree_->getResult();

const size_t rowLimit = 100;
if (siblingResult->size() < rowLimit) {
auto siblingVariables = siblingTree_->getVariableColumns();

// Build value clause for each common variable.
std::string valueClauses = "{ ";
for (const auto& lVar : parsedServiceClause_.visibleVariables_) {
auto it = siblingVariables.find(lVar);
if (it == siblingVariables.end()) {
continue;
}
const auto& sVar = *it;

valueClauses += "VALUES " + sVar.first.name() + " { ";
for (size_t rowIndex = 0; rowIndex < siblingResult->size();
++rowIndex) {
const auto& optionalString =
ExportQueryExecutionTrees::idToStringAndType(
siblingTree_->getRootOperation()->getIndex(),
siblingResult->idTable()(rowIndex, sVar.second.columnIndex_),
siblingResult->localVocab());
if (optionalString.has_value()) {
valueClauses += optionalString.value().first + " ";
}
}
valueClauses += "} . ";
}

parsedServiceClause_.graphPatternAsString_ =
valueClauses + parsedServiceClause_.graphPatternAsString_.substr(2);
}
// Try to optimize the Service Clause using its sibling Operation.
if (auto valuesClause = getSiblingValuesClause(); valuesClause.has_value()) {
parsedServiceClause_.graphPatternAsString_ =
"{ " + valuesClause.value() +
parsedServiceClause_.graphPatternAsString_.substr(2);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should probably find the first occurrence of the `{`; I am not sure whether this breaks as soon as someone puts random spaces in their query.

}

// Construct the query to be sent to the SPARQL endpoint.
Expand Down Expand Up @@ -200,6 +171,58 @@ ResultTable Service::computeResult() {
return {std::move(idTable), resultSortedOn(), std::move(localVocab)};
}

std::optional<std::string> Service::getSiblingValuesClause() const {
joka921 marked this conversation as resolved.
Show resolved Hide resolved
if (siblingTree_ == nullptr) {
return std::nullopt;
}

const auto& siblingResult = siblingTree_->getResult();
if (siblingResult->size() >
RuntimeParameters().get<"service-max-value-rows">()) {
return std::nullopt;
}

std::vector<ColumnIndex> commonColumnIndices;
const auto& siblingVars = siblingTree_->getVariableColumns();
std::string vars = "";
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is one bug remaining:
If a row (combination of all relevant variables) appears multiple times in the sibling result, then you may only store it once in the VALUES clause, otherwise the SERVICE will duplicate each result leading to wrong results with too many rows.

for (const auto& localVar : parsedServiceClause_.visibleVariables_) {
auto it = siblingVars.find(localVar);
if (it == siblingVars.end()) {
continue;
}
vars += it->first.name() + " ";
commonColumnIndices.push_back(it->second.columnIndex_);
}
vars.pop_back();
if (commonColumnIndices.size() > 1) {
vars = "(" + vars + ")";
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about you ALWAYS add the parentheses, they also work for only one variable, and this makes the code simpler.


std::string values = " { ";
for (size_t rowIndex = 0; rowIndex < siblingResult->size(); ++rowIndex) {
std::string row;
for (size_t i = 0; i < commonColumnIndices.size(); ++i) {
const auto& optionalString = ExportQueryExecutionTrees::idToStringAndType(
siblingTree_->getRootOperation()->getIndex(),
siblingResult->idTable()(rowIndex, commonColumnIndices[i]),
siblingResult->localVocab());
Copy link
Contributor Author

@UNEXENU UNEXENU May 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Somehow the localVocab accessed here with siblingResult->localVocab() doesn't contain useful data.
As a result the optionalString has no Content.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll have a look at this. I however don't suspect that the problem is the local vocab.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you send me a reproducer (Dataset + Query) where something doesn't work as expected?
I just tried a simple example with one variable where this worked.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, sorry, here is a reproducible example query. I use it with the Olympics dataset, but it doesn't use the local dataset anyway:

PREFIX schema: <http://schema.org/>
PREFIX imdb: <https://www.imdb.com/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX p: <http://www.wikidata.org/prop/>
SELECT ?imdb_id ?imdb_votes ?imdb_rating WHERE {
VALUES ?imdb_id { "tt0477348" "tt0118715" "tt0116282" } .
  SERVICE <https://qlever.cs.uni-freiburg.de/api/imdb> {
	?movie_imdb imdb:id ?imdb_id .
	?movie_imdb imdb:type "movie" .
    ?movie_imdb imdb:numVotes ?imdb_votes .
    ?movie_imdb imdb:averageRating ?imdb_rating .
  }
}
ORDER BY DESC(?imdb_votes)

Expected result: 3 rows with the given imdb_id values.
Actual result: 0 rows, Debug log shows that the sent Service-Query contains a Values Clause with imdb_id as key but no values.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I figured it out....
Your code is absolutely working as it should, the problem was the implementation of the VALUES clause.
Basically you could neither evaluate it twice, nor read it from the cache during the same query.
I have pushed a quick fix to your branch (just do a git pull in your local copy to get it) and everything should work as expected.


if (optionalString.has_value()) {
row += optionalString.value().first;
if (i < commonColumnIndices.size() - 1) {
row += " ";
}
}
}
if (commonColumnIndices.size() > 1) {
row = "(" + row + ")";
}
values += row + " ";
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For all your someString += something + something other
use absl::StrAppend. It is more efficient and more idiomatic. (three places in this file)

}

return "VALUES " + vars + values + "} . ";
}

// ____________________________________________________________________________
template <size_t I>
void Service::writeTsvResult(cppcoro::generator<std::string_view> tsvResult,
Expand Down
6 changes: 6 additions & 0 deletions src/engine/Service.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ class Service : public Operation {
GetTsvFunction getTsvFunction = sendHttpOrHttpsRequest,
std::shared_ptr<QueryExecutionTree> siblingTree = nullptr);

inline void setSiblingTree(std::shared_ptr<QueryExecutionTree> siblingTree) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The inline doesn't do anything here.
And add a short docstring, the semantics of a sibling tree should be documented.

siblingTree_ = siblingTree;
}

// Methods inherited from base class `Operation`.
std::string getDescriptor() const override;
size_t getResultWidth() const override;
Expand All @@ -86,6 +90,8 @@ class Service : public Operation {
// Compute the result using `getTsvFunction_`.
ResultTable computeResult() override;

// Build a SPARQL `VALUES` clause from the result of the sibling tree,
// restricted to the variables that are visible in the SERVICE clause and
// also present in the sibling tree. Returns `std::nullopt` if no sibling
// tree is set, or if the sibling result has more rows than the
// `service-max-value-rows` runtime parameter.
std::optional<std::string> getSiblingValuesClause() const;
joka921 marked this conversation as resolved.
Show resolved Hide resolved

// Write the given TSV result to the given result object. The `I` is the width
// of the result table.
//
Expand Down
3 changes: 2 additions & 1 deletion src/global/RuntimeParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ inline auto& RuntimeParameters() {
30s}),
SizeT<"lazy-index-scan-max-size-materialization">{1'000'000},
Bool<"use-binsearch-transitive-path">{true},
Bool<"group-by-hash-map-enabled">{false}};
Bool<"group-by-hash-map-enabled">{false},
SizeT<"service-max-value-rows">{100}};
}();
return params;
}
Expand Down
3 changes: 1 addition & 2 deletions test/ServiceTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,7 @@ TEST_F(ServiceTest, computeResult) {

std::string_view expectedSparqlQuery5 =
"PREFIX doof: <http://doof.org> SELECT ?x ?y "
"WHERE { VALUES ?x { <x> <blu> } . VALUES ?y { <y> <bla> } . "
"?x <ble> ?y . }";
"WHERE { VALUES (?x ?y) { (<x> <y>) (<blu> <bla>) } . ?x <ble> ?y . }";

Service serviceOperation5{
testQec, parsedServiceClause5,
Expand Down