Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a VALUES clause to the query of a SERVICE clause to simplify the execution #1341

Merged
merged 13 commits into from
Jun 5, 2024
36 changes: 36 additions & 0 deletions src/engine/QueryPlanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1642,6 +1642,13 @@ std::vector<QueryPlanner::SubtreePlan> QueryPlanner::createJoinCandidates(
return {makeSubtreePlan<OptionalJoin>(_qec, a._qet, b._qet)};
}

// Check if one of the two Operations is a SERVICE. If so, we can try
// to simplify the Service Query using the result of the other operation.
if (auto opt = createJoinWithService(a, b, jcs)) {
candidates.push_back(std::move(opt.value()));
return candidates;
}

if (jcs.size() >= 2) {
// If there are two or more join columns and we are not using the
// TwoColumnJoin (the if part before this comment), use a multiColumnJoin.
Expand Down Expand Up @@ -1770,6 +1777,35 @@ auto QueryPlanner::createJoinWithHasPredicateScan(
return plan;
}

// _____________________________________________________________________
auto QueryPlanner::createJoinWithService(
SubtreePlan a, SubtreePlan b,
const std::vector<std::array<ColumnIndex, 2>>& jcs)
-> std::optional<SubtreePlan> {
auto aRootOp = std::dynamic_pointer_cast<Service>(a._qet->getRootOperation());
auto bRootOp = std::dynamic_pointer_cast<Service>(b._qet->getRootOperation());

// Exactly one of the two Operations can be a service.
if (static_cast<bool>(aRootOp) == static_cast<bool>(bRootOp)) {
return std::nullopt;
}

auto service = aRootOp ? aRootOp : bRootOp;
auto sibling = bRootOp ? a : b;

service->setSiblingTree(sibling._qet);

const auto& qec = service->getExecutionContext();

SubtreePlan plan =
jcs.size() == 1
? makeSubtreePlan<Join>(qec, a._qet, b._qet, jcs[0][0], jcs[0][1])
: makeSubtreePlan<MultiColumnJoin>(qec, a._qet, b._qet);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should add unit tests for this planning (in the QueryPlannerTest.cpp and QueryPlannerTestHelpers.h you can add a matcher for a service clause, have a look at that code, and let me know if there is trouble)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have implemented a simple test for this in this commit, let me know if that is sufficient or provide feedback for further improvements.

mergeSubtreePlanIds(plan, a, b);
joka921 marked this conversation as resolved.
Show resolved Hide resolved

return plan;
}

// _____________________________________________________________________
void QueryPlanner::QueryGraph::setupGraph(
const std::vector<SubtreePlan>& leafOperations) {
Expand Down
4 changes: 4 additions & 0 deletions src/engine/QueryPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,10 @@ class QueryPlanner {
SubtreePlan a, SubtreePlan b,
const std::vector<std::array<ColumnIndex, 2>>& jcs);

[[nodiscard]] static std::optional<SubtreePlan> createJoinWithService(
SubtreePlan a, SubtreePlan b,
const std::vector<std::array<ColumnIndex, 2>>& jcs);

[[nodiscard]] vector<SubtreePlan> getOrderByRow(
const ParsedQuery& pq,
const std::vector<std::vector<SubtreePlan>>& dpTab) const;
Expand Down
83 changes: 80 additions & 3 deletions src/engine/Service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,14 @@
#include "engine/Service.h"

#include <absl/strings/str_cat.h>
#include <absl/strings/str_join.h>
#include <absl/strings/str_split.h>

#include "engine/CallFixedSize.h"
#include "engine/ExportQueryExecutionTrees.h"
#include "engine/Values.h"
#include "engine/VariableToColumnMap.h"
#include "global/RuntimeParameters.h"
#include "parser/TokenizerCtre.h"
#include "parser/TurtleParser.h"
#include "util/Exception.h"
Expand All @@ -21,18 +24,24 @@
// ____________________________________________________________________________
Service::Service(QueryExecutionContext* qec,
parsedQuery::Service parsedServiceClause,
GetTsvFunction getTsvFunction)
GetTsvFunction getTsvFunction,
std::shared_ptr<QueryExecutionTree> siblingTree)
: Operation{qec},
parsedServiceClause_{std::move(parsedServiceClause)},
getTsvFunction_{std::move(getTsvFunction)} {}
getTsvFunction_{std::move(getTsvFunction)},
siblingTree_{std::move(siblingTree)} {}

// ____________________________________________________________________________
std::string Service::getCacheKeyImpl() const {
std::ostringstream os;
// TODO: This duplicates code in GraphPatternOperation.cpp .
os << "SERVICE " << parsedServiceClause_.serviceIri_.toSparql() << " {\n"
<< parsedServiceClause_.prologue_ << "\n"
<< parsedServiceClause_.graphPatternAsString_ << "\n}\n";
<< parsedServiceClause_.graphPatternAsString_ << "\n";
if (siblingTree_ != nullptr) {
os << siblingTree_->getRootOperation()->getCacheKey() << "\n";
}
os << "}\n";
return std::move(os).str();
}

Expand Down Expand Up @@ -92,6 +101,14 @@ Result Service::computeResult([[maybe_unused]] bool requestLaziness) {
serviceIriString.remove_suffix(1);
ad_utility::httpUtils::Url serviceUrl{serviceIriString};

// Try to simplify the Service Query using it's sibling Operation.
if (auto valuesClause = getSiblingValuesClause(); valuesClause.has_value()) {
auto openBracketPos = parsedServiceClause_.graphPatternAsString_.find('{');
parsedServiceClause_.graphPatternAsString_ =
"{\n" + valuesClause.value() + '\n' +
parsedServiceClause_.graphPatternAsString_.substr(openBracketPos + 1);
}

// Construct the query to be sent to the SPARQL endpoint.
std::string variablesForSelectClause = absl::StrJoin(
parsedServiceClause_.visibleVariables_, " ", Variable::AbslFormatter);
Expand Down Expand Up @@ -159,6 +176,66 @@ Result Service::computeResult([[maybe_unused]] bool requestLaziness) {
return {std::move(idTable), resultSortedOn(), std::move(localVocab)};
}

// ____________________________________________________________________________
std::optional<std::string> Service::getSiblingValuesClause() const {
joka921 marked this conversation as resolved.
Show resolved Hide resolved
if (siblingTree_ == nullptr) {
return std::nullopt;
}

const auto& siblingResult = siblingTree_->getResult();
if (siblingResult->idTable().size() >
RuntimeParameters().get<"service-max-value-rows">()) {
return std::nullopt;
}

checkCancellation();

std::vector<ColumnIndex> commonColumnIndices;
const auto& siblingVars = siblingTree_->getVariableColumns();
std::string vars = "(";
for (const auto& localVar : parsedServiceClause_.visibleVariables_) {
auto it = siblingVars.find(localVar);
if (it == siblingVars.end()) {
continue;
}
absl::StrAppend(&vars, it->first.name(), " ");
commonColumnIndices.push_back(it->second.columnIndex_);
}
vars.back() = ')';

checkCancellation();

ad_utility::HashSet<std::string> rowSet;

std::string values = " { ";
for (size_t rowIndex = 0; rowIndex < siblingResult->idTable().size();
++rowIndex) {
std::string row = "(";
for (const auto& columnIdx : commonColumnIndices) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Inide the for each row loop (the outer one) should probably be one cancellation check per iterationl

const auto& optionalString = ExportQueryExecutionTrees::idToStringAndType(
siblingTree_->getRootOperation()->getIndex(),
siblingResult->idTable()(rowIndex, columnIdx),
siblingResult->localVocab());

if (optionalString.has_value()) {
absl::StrAppend(&row, optionalString.value().first, " ");
}
}
row.back() = ')';

if (rowSet.contains(row)) {
continue;
}

rowSet.insert(row);
absl::StrAppend(&values, row, " ");

checkCancellation();
}

return "VALUES " + vars + values + "} . ";
}

// ____________________________________________________________________________
template <size_t I>
void Service::writeTsvResult(cppcoro::generator<std::string_view> tsvResult,
Expand Down
21 changes: 20 additions & 1 deletion src/engine/Service.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ class Service : public Operation {
// The function used to obtain the result from the remote endpoint.
GetTsvFunction getTsvFunction_;

// The siblingTree, used for SERVICE clause optimization.
std::shared_ptr<QueryExecutionTree> siblingTree_;

public:
// Construct from parsed Service clause.
//
Expand All @@ -54,14 +57,27 @@ class Service : public Operation {
// but in our tests (`ServiceTest`) we use a mock function that does not
// require a running `HttpServer`.
Service(QueryExecutionContext* qec, parsedQuery::Service parsedServiceClause,
GetTsvFunction getTsvFunction = sendHttpOrHttpsRequest);
GetTsvFunction getTsvFunction = sendHttpOrHttpsRequest,
std::shared_ptr<QueryExecutionTree> siblingTree = nullptr);

// Set the siblingTree (subTree that will later be joined with the Result of
// the Service Operation), used to reduce the Service Queries Complexity.
void setSiblingTree(std::shared_ptr<QueryExecutionTree> siblingTree) {
siblingTree_ = siblingTree;
}

// Methods inherited from base class `Operation`.
std::string getDescriptor() const override;
size_t getResultWidth() const override;
std::vector<ColumnIndex> resultSortedOn() const override { return {}; }
float getMultiplicity(size_t col) override;

// Getters for testing.
const auto& getSiblingTree() const { return siblingTree_; }
const auto& getGraphPatternAsString() const {
return parsedServiceClause_.graphPatternAsString_;
}

private:
uint64_t getSizeEstimateBeforeLimit() override;

Expand All @@ -82,6 +98,9 @@ class Service : public Operation {
// Compute the result using `getTsvFunction_`.
Result computeResult([[maybe_unused]] bool requestLaziness) override;

// Get a VALUES clause that contains the values of the siblingTree's result.
std::optional<std::string> getSiblingValuesClause() const;
joka921 marked this conversation as resolved.
Show resolved Hide resolved

// Write the given TSV result to the given result object. The `I` is the width
// of the result table.
//
Expand Down
6 changes: 4 additions & 2 deletions src/engine/Values.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,8 +130,10 @@ void Values::writeValues(IdTable* idTablePtr, LocalVocab* localVocab) {
std::vector<size_t> numLocalVocabPerColumn(idTable.numColumns());
for (auto& row : parsedValues_._values) {
for (size_t colIdx = 0; colIdx < idTable.numColumns(); colIdx++) {
TripleComponent& tc = row[colIdx];
Id id = std::move(tc).toValueId(getIndex().getVocab(), *localVocab);
const TripleComponent& tc = row[colIdx];
// TODO<joka921> We don't want to move, but also don't want to
// unconditionally copy.
Id id = TripleComponent{tc}.toValueId(getIndex().getVocab(), *localVocab);
idTable(rowIdx, colIdx) = id;
if (id.getDatatype() == Datatype::LocalVocabIndex) {
++numLocalVocabPerColumn[colIdx];
Expand Down
3 changes: 2 additions & 1 deletion src/global/RuntimeParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@ inline auto& RuntimeParameters() {
30s}),
SizeT<"lazy-index-scan-max-size-materialization">{1'000'000},
Bool<"use-binsearch-transitive-path">{true},
Bool<"group-by-hash-map-enabled">{false}};
Bool<"group-by-hash-map-enabled">{false},
SizeT<"service-max-value-rows">{100}};
}();
return params;
}
Expand Down
26 changes: 26 additions & 0 deletions test/QueryPlannerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1062,3 +1062,29 @@ TEST(QueryPlanner, CancellationCancelsQueryPlanning) {
HasSubstr("Query planning"),
ad_utility::CancellationException);
}

// ___________________________________________________________________________
TEST(QueryPlanner, JoinWithService) {
auto scan = h::IndexScanFromStrings;

auto sibling = scan("?x", "<is-a>", "?y");

std::string_view graphPatternAsString = "{ ?x <is-a> ?z . }";

h::expect(
"SELECT * WHERE {"
"SERVICE <https://endpoint.com> { ?x <is-a> ?z . ?y <is-a> ?a . }}",
h::Service(std::nullopt, "{ ?x <is-a> ?z . ?y <is-a> ?a . }"));

h::expect(
"SELECT * WHERE { ?x <is-a> ?y ."
"SERVICE <https://endpoint.com> { ?x <is-a> ?z . }}",
h::UnorderedJoins(sibling, h::Service(sibling, graphPatternAsString)));

h::expect(
"SELECT * WHERE { ?x <is-a> ?y . "
"SERVICE <https://endpoint.com> { ?x <is-a> ?z . ?y <is-a> ?a . }}",
h::MultiColumnJoin(
sibling,
h::Sort(h::Service(sibling, "{ ?x <is-a> ?z . ?y <is-a> ?a . }"))));
}
18 changes: 18 additions & 0 deletions test/QueryPlannerTestHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "engine/OrderBy.h"
#include "engine/QueryExecutionTree.h"
#include "engine/QueryPlanner.h"
#include "engine/Service.h"
#include "engine/Sort.h"
#include "engine/TextIndexScanForEntity.h"
#include "engine/TextIndexScanForWord.h"
Expand Down Expand Up @@ -285,6 +286,23 @@ constexpr auto OrderBy = [](const ::OrderBy::SortedVariables& sortedVariables,
// Match a `UNION` operation.
constexpr auto Union = MatchTypeAndOrderedChildren<::Union>;

// Match a `SERVICE` operation.
constexpr auto Service = [](const std::optional<QetMatcher>& siblingMatcher,
std::string_view graphPatternAsString) {
const auto optSiblingMatcher =
[&]() -> Matcher<const std::shared_ptr<QueryExecutionTree>&> {
if (siblingMatcher.has_value()) {
return Pointee(siblingMatcher.value());
}
return IsNull();
}();

return RootOperation<::Service>(
AllOf(AD_PROPERTY(::Service, getSiblingTree, optSiblingMatcher),
AD_PROPERTY(::Service, getGraphPatternAsString,
Eq(graphPatternAsString))));
};

/// Parse the given SPARQL `query`, pass it to a `QueryPlanner` with empty
/// execution context, and return the resulting `QueryExecutionTree`
QueryExecutionTree parseAndPlan(std::string query, QueryExecutionContext* qec) {
Expand Down
Loading
Loading