Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a VALUES clause to the query of a SERVICE clause to simplify the execution #1341

Merged
merged 13 commits into from
Jun 5, 2024
36 changes: 36 additions & 0 deletions src/engine/QueryPlanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1641,6 +1641,13 @@
return {makeSubtreePlan<OptionalJoin>(_qec, a._qet, b._qet)};
}

// Check if one of the two Operations is a SERVICE. If so, we can try
// to simplify the Service Query using the result of the other operation.
if (auto opt = createJoinWithService(a, b, jcs)) {
candidates.push_back(std::move(opt.value()));
return candidates;
}

Check warning on line 1649 in src/engine/QueryPlanner.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/QueryPlanner.cpp#L1647-L1649

Added lines #L1647 - L1649 were not covered by tests

if (jcs.size() >= 2) {
// If there are two or more join columns and we are not using the
// TwoColumnJoin (the if part before this comment), use a multiColumnJoin.
Expand Down Expand Up @@ -1769,6 +1776,35 @@
return plan;
}

// _____________________________________________________________________
auto QueryPlanner::createJoinWithService(
SubtreePlan a, SubtreePlan b,
const std::vector<std::array<ColumnIndex, 2>>& jcs)
-> std::optional<SubtreePlan> {
auto aRootOp = std::dynamic_pointer_cast<Service>(a._qet->getRootOperation());
auto bRootOp = std::dynamic_pointer_cast<Service>(b._qet->getRootOperation());

// Exactly one of the two Operations can be a service.
if (static_cast<bool>(aRootOp) == static_cast<bool>(bRootOp)) {
return std::nullopt;
}

auto service = aRootOp ? aRootOp : bRootOp;
auto sibling = bRootOp ? a : b;

service->setSiblingTree(sibling._qet);

Check warning on line 1795 in src/engine/QueryPlanner.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/QueryPlanner.cpp#L1795

Added line #L1795 was not covered by tests

const auto& qec = service->getExecutionContext();

Check warning on line 1797 in src/engine/QueryPlanner.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/QueryPlanner.cpp#L1797

Added line #L1797 was not covered by tests

SubtreePlan plan =

Check warning on line 1799 in src/engine/QueryPlanner.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/QueryPlanner.cpp#L1799

Added line #L1799 was not covered by tests
jcs.size() == 1
? makeSubtreePlan<Join>(qec, a._qet, b._qet, jcs[0][0], jcs[0][1])
: makeSubtreePlan<MultiColumnJoin>(qec, a._qet, b._qet);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should add unit tests for this planning (in the QueryPlannerTest.cpp and QueryPlannerTestHelpers.h you can add a matcher for a service clause, have a look at that code, and let me know if there is trouble)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have implemented a simple test for this in this commit, let me know if that is sufficient or provide feedback for further improvements.

mergeSubtreePlanIds(plan, a, b);

Check warning on line 1803 in src/engine/QueryPlanner.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/QueryPlanner.cpp#L1801-L1803

Added lines #L1801 - L1803 were not covered by tests
joka921 marked this conversation as resolved.
Show resolved Hide resolved

return plan;

Check warning on line 1805 in src/engine/QueryPlanner.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/QueryPlanner.cpp#L1805

Added line #L1805 was not covered by tests
}

// _____________________________________________________________________
void QueryPlanner::QueryGraph::setupGraph(
const std::vector<SubtreePlan>& leafOperations) {
Expand Down
4 changes: 4 additions & 0 deletions src/engine/QueryPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,10 @@ class QueryPlanner {
SubtreePlan a, SubtreePlan b,
const std::vector<std::array<ColumnIndex, 2>>& jcs);

[[nodiscard]] static std::optional<SubtreePlan> createJoinWithService(
SubtreePlan a, SubtreePlan b,
const std::vector<std::array<ColumnIndex, 2>>& jcs);

[[nodiscard]] vector<SubtreePlan> getOrderByRow(
const ParsedQuery& pq,
const std::vector<std::vector<SubtreePlan>>& dpTab) const;
Expand Down
70 changes: 68 additions & 2 deletions src/engine/Service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@
#include "engine/Service.h"

#include <absl/strings/str_cat.h>
#include <absl/strings/str_join.h>
#include <absl/strings/str_split.h>

#include "engine/CallFixedSize.h"
#include "engine/ExportQueryExecutionTrees.h"
#include "engine/Values.h"
#include "engine/VariableToColumnMap.h"
#include "global/RuntimeParameters.h"
#include "parser/TokenizerCtre.h"
#include "parser/TurtleParser.h"
#include "util/Exception.h"
Expand All @@ -21,10 +24,12 @@
// ____________________________________________________________________________
Service::Service(QueryExecutionContext* qec,
parsedQuery::Service parsedServiceClause,
GetTsvFunction getTsvFunction)
GetTsvFunction getTsvFunction,
std::shared_ptr<QueryExecutionTree> siblingTree)
: Operation{qec},
parsedServiceClause_{std::move(parsedServiceClause)},
getTsvFunction_{std::move(getTsvFunction)} {}
getTsvFunction_{std::move(getTsvFunction)},
siblingTree_{std::move(siblingTree)} {}

// ____________________________________________________________________________
std::string Service::getCacheKeyImpl() const {
Expand Down Expand Up @@ -92,6 +97,14 @@
serviceIriString.remove_suffix(1);
ad_utility::httpUtils::Url serviceUrl{serviceIriString};

// Try to simplify the Service Query using it's sibling Operation.
if (auto valuesClause = getSiblingValuesClause(); valuesClause.has_value()) {
auto openBracketPos = parsedServiceClause_.graphPatternAsString_.find('{');
parsedServiceClause_.graphPatternAsString_ =
"{\n" + valuesClause.value() + '\n' +
parsedServiceClause_.graphPatternAsString_.substr(openBracketPos + 1);
}

// Construct the query to be sent to the SPARQL endpoint.
std::string variablesForSelectClause = absl::StrJoin(
parsedServiceClause_.visibleVariables_, " ", Variable::AbslFormatter);
Expand Down Expand Up @@ -159,6 +172,59 @@
return {std::move(idTable), resultSortedOn(), std::move(localVocab)};
}

// ____________________________________________________________________________
std::optional<std::string> Service::getSiblingValuesClause() const {
joka921 marked this conversation as resolved.
Show resolved Hide resolved
if (siblingTree_ == nullptr) {
return std::nullopt;
}

const auto& siblingResult = siblingTree_->getResult();
if (siblingResult->size() >
RuntimeParameters().get<"service-max-value-rows">()) {
return std::nullopt;
}

Check warning on line 185 in src/engine/Service.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/Service.cpp#L184-L185

Added lines #L184 - L185 were not covered by tests

std::vector<ColumnIndex> commonColumnIndices;
const auto& siblingVars = siblingTree_->getVariableColumns();
std::string vars = "(";
for (const auto& localVar : parsedServiceClause_.visibleVariables_) {
auto it = siblingVars.find(localVar);
if (it == siblingVars.end()) {
continue;
}

Check warning on line 194 in src/engine/Service.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/Service.cpp#L193-L194

Added lines #L193 - L194 were not covered by tests
vars += it->first.name() + " ";
commonColumnIndices.push_back(it->second.columnIndex_);
}
vars.back() = ')';

ad_utility::HashSet<std::string> rowSet;

std::string values = " { ";
for (size_t rowIndex = 0; rowIndex < siblingResult->size(); ++rowIndex) {
std::string row = "(";
for (size_t i = 0; i < commonColumnIndices.size(); ++i) {
const auto& optionalString = ExportQueryExecutionTrees::idToStringAndType(
siblingTree_->getRootOperation()->getIndex(),
siblingResult->idTable()(rowIndex, commonColumnIndices[i]),
siblingResult->localVocab());
Copy link
Contributor Author

@UNEXENU UNEXENU May 16, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Somehow the localVocab accessed here with siblingResult->localVocab() doesn't contain useful data.
As a result the optionalString has no Content.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll have a look at this. I however don't suspect that the problem is the local vocab.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you send me a reproducer (Dataset + Query) where something doesn't work as expected?
I just tried a simple example with one variable where this worked.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok sry, here is a reproducable example query, i use it with Olympics Dataset but it doesn't use the local dataset anyway:

PREFIX schema: <http://schema.org/>
PREFIX imdb: <https://www.imdb.com/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX p: <http://www.wikidata.org/prop/>
SELECT ?imdb_id ?imdb_votes ?imdb_rating WHERE {
VALUES ?imdb_id { "tt0477348" "tt0118715" "tt0116282" } .
  SERVICE <https://qlever.cs.uni-freiburg.de/api/imdb> {
	?movie_imdb imdb:id ?imdb_id .
	?movie_imdb imdb:type "movie" .
    ?movie_imdb imdb:numVotes ?imdb_votes .
    ?movie_imdb imdb:averageRating ?imdb_rating .
  }
}
ORDER BY DESC(?imdb_votes)

Expected result: 3 rows with the given imdb_id values.
Actual result: 0 rows, Debug log shows that the sent Service-Query contains a Values Clause with imdb_id as key but no values.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I figured it out....
Your code is absolutely working as it should, the problem was the implementation of the VALUES clause.
Basically you could neither evaluate it twice, nor read it from the cache during the same query.
I have pushed a quick fix to your branch (just do a git pull in your local copy to get it) and everything should work as expected.


if (optionalString.has_value()) {
row += optionalString.value().first + ' ';
}
}
row.back() = ')';

if (rowSet.contains(row)) {
continue;
}

Check warning on line 219 in src/engine/Service.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/Service.cpp#L218-L219

Added lines #L218 - L219 were not covered by tests

rowSet.insert(row);
values += row + " ";
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For all your someString += something + something other
use absl::StrAppend. It is more efficient and more idiomatic. (three places in this file)

}

return "VALUES " + vars + values + "} . ";
}

// ____________________________________________________________________________
template <size_t I>
void Service::writeTsvResult(cppcoro::generator<std::string_view> tsvResult,
Expand Down
15 changes: 14 additions & 1 deletion src/engine/Service.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@
// The function used to obtain the result from the remote endpoint.
GetTsvFunction getTsvFunction_;

// The siblingTree, used for SERVICE clause optimization.
std::shared_ptr<QueryExecutionTree> siblingTree_;

public:
// Construct from parsed Service clause.
//
Expand All @@ -54,7 +57,14 @@
// but in our tests (`ServiceTest`) we use a mock function that does not
// require a running `HttpServer`.
Service(QueryExecutionContext* qec, parsedQuery::Service parsedServiceClause,
GetTsvFunction getTsvFunction = sendHttpOrHttpsRequest);
GetTsvFunction getTsvFunction = sendHttpOrHttpsRequest,
std::shared_ptr<QueryExecutionTree> siblingTree = nullptr);

// Set the siblingTree (subTree that will later be joined with the Result of
// the Service Operation), used to reduce the Service Queries Complexity.
void setSiblingTree(std::shared_ptr<QueryExecutionTree> siblingTree) {
siblingTree_ = siblingTree;
}

Check warning on line 67 in src/engine/Service.h

View check run for this annotation

Codecov / codecov/patch

src/engine/Service.h#L65-L67

Added lines #L65 - L67 were not covered by tests

// Methods inherited from base class `Operation`.
std::string getDescriptor() const override;
Expand Down Expand Up @@ -82,6 +92,9 @@
// Compute the result using `getTsvFunction_`.
ResultTable computeResult() override;

// Get a VALUES clause that contains the values of the siblingTree's result.
std::optional<std::string> getSiblingValuesClause() const;
joka921 marked this conversation as resolved.
Show resolved Hide resolved

// Write the given TSV result to the given result object. The `I` is the width
// of the result table.
//
Expand Down
4 changes: 3 additions & 1 deletion src/engine/Values.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,9 @@ void Values::writeValues(IdTable* idTablePtr, LocalVocab* localVocab) {
for (auto& row : parsedValues_._values) {
for (size_t colIdx = 0; colIdx < idTable.numColumns(); colIdx++) {
TripleComponent& tc = row[colIdx];
Id id = std::move(tc).toValueId(getIndex().getVocab(), *localVocab);
// TODO<joka921> We don't want to move, but also don't want to
// unconditionally copy.
Id id = TripleComponent{tc}.toValueId(getIndex().getVocab(), *localVocab);
idTable(rowIdx, colIdx) = id;
if (id.getDatatype() == Datatype::LocalVocabIndex) {
++numLocalVocabPerColumn[colIdx];
Expand Down
3 changes: 2 additions & 1 deletion src/global/RuntimeParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ inline auto& RuntimeParameters() {
30s}),
SizeT<"lazy-index-scan-max-size-materialization">{1'000'000},
Bool<"use-binsearch-transitive-path">{true},
Bool<"group-by-hash-map-enabled">{false}};
Bool<"group-by-hash-map-enabled">{false},
SizeT<"service-max-value-rows">{100}};
}();
return params;
}
Expand Down
30 changes: 30 additions & 0 deletions test/ServiceTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "parser/GraphPatternOperation.h"
#include "util/IdTableHelpers.h"
#include "util/IndexTestHelpers.h"
#include "util/TripleComponentTestHelpers.h"
#include "util/http/HttpUtils.h"

// Fixture that sets up a test index and a factory for producing mocks for the
Expand Down Expand Up @@ -192,4 +193,33 @@ TEST_F(ServiceTest, computeResult) {
IdTable expectedIdTable = makeIdTableFromVector(
{{idX, idY}, {idBla, idBli}, {idBlu, idBla}, {idBli, idBlu}});
EXPECT_EQ(result->idTable(), expectedIdTable);

// Check 5: When a siblingTree with variables common to the Service Clause is
// passed, the Service Operation shall use the siblings result to reduce
// its Query complexity by injecting them as Value Clause
auto iri = ad_utility::testing::iri;
using TC = TripleComponent;
auto siblingTree = std::make_shared<QueryExecutionTree>(
testQec,
std::make_shared<Values>(
testQec,
(parsedQuery::SparqlValues){
{Variable{"?x"}, Variable{"?y"}, Variable{"?z"}},
{{TC(iri("<x>")), TC(iri("<y>")), TC(iri("<z>"))},
{TC(iri("<blu>")), TC(iri("<bla>")), TC(iri("<blo>"))}}}));

auto parsedServiceClause5 = parsedServiceClause;
parsedServiceClause5.graphPatternAsString_ = "{ ?x <ble> ?y . }";

std::string_view expectedSparqlQuery5 =
"PREFIX doof: <http://doof.org> SELECT ?x ?y "
"WHERE { VALUES (?x ?y) { (<x> <y>) (<blu> <bla>) } . ?x <ble> ?y . }";

Service serviceOperation5{
testQec, parsedServiceClause5,
getTsvFunctionFactory(
expectedUrl, expectedSparqlQuery5,
"?x\t?y\n<x>\t<y>\n<bla>\t<bli>\n<blu>\t<bla>\n<bli>\t<blu>\n"),
siblingTree};
EXPECT_NO_THROW(serviceOperation5.getResult());
}
Loading