diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index c96347057b..ae69961573 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -1642,6 +1642,13 @@ std::vector QueryPlanner::createJoinCandidates( return {makeSubtreePlan(_qec, a._qet, b._qet)}; } + // Check if one of the two Operations is a SERVICE. If so, we can try + // to simplify the Service Query using the result of the other operation. + if (auto opt = createJoinWithService(a, b, jcs)) { + candidates.push_back(std::move(opt.value())); + return candidates; + } + if (jcs.size() >= 2) { // If there are two or more join columns and we are not using the // TwoColumnJoin (the if part before this comment), use a multiColumnJoin. @@ -1770,6 +1777,40 @@ auto QueryPlanner::createJoinWithHasPredicateScan( return plan; } +// _____________________________________________________________________ +auto QueryPlanner::createJoinWithService( + SubtreePlan a, SubtreePlan b, + const std::vector>& jcs) + -> std::optional { + auto aRootOp = std::dynamic_pointer_cast(a._qet->getRootOperation()); + auto bRootOp = std::dynamic_pointer_cast(b._qet->getRootOperation()); + + // Only applicable if exactly one of the two root operations is a SERVICE. + if (static_cast(aRootOp) == static_cast(bRootOp)) { + return std::nullopt; + } + + auto service = aRootOp ? aRootOp : bRootOp; + // The sibling is the plan that is not the SERVICE. Bind it by reference: + // only its `_qet` is read, so copying the whole `SubtreePlan` is wasted. + const auto& sibling = bRootOp ? a : b; + + // NOTE(review): `service` is the root operation of one of the input trees, + // so this modifies that tree in place, not only the plan returned below. + // Confirm that no other candidate plan relies on the unmodified SERVICE. + service->setSiblingTree(sibling._qet); + + const auto& qec = service->getExecutionContext(); + + SubtreePlan plan = + jcs.size() == 1 + ? 
makeSubtreePlan(qec, a._qet, b._qet, jcs[0][0], jcs[0][1]) + : makeSubtreePlan(qec, a._qet, b._qet); + mergeSubtreePlanIds(plan, a, b); + + return plan; +} + // _____________________________________________________________________ void QueryPlanner::QueryGraph::setupGraph( const std::vector& leafOperations) { diff --git a/src/engine/QueryPlanner.h b/src/engine/QueryPlanner.h index 1c44c02f5f..cff1059e64 100644 --- a/src/engine/QueryPlanner.h +++ b/src/engine/QueryPlanner.h @@ -329,6 +329,13 @@ class QueryPlanner { SubtreePlan a, SubtreePlan b, const std::vector>& jcs); + // If exactly one of `a` and `b` has a SERVICE as its root operation, set + // the other plan's tree as that SERVICE's sibling tree and return a plan + // that combines `a` and `b`. Otherwise return `std::nullopt`. + [[nodiscard]] static std::optional createJoinWithService( + SubtreePlan a, SubtreePlan b, + const std::vector>& jcs); + [[nodiscard]] vector getOrderByRow( const ParsedQuery& pq, const std::vector>& dpTab) const; diff --git a/src/engine/Service.cpp b/src/engine/Service.cpp index 71d1617a59..c4252a9a8f 100644 --- a/src/engine/Service.cpp +++ b/src/engine/Service.cpp @@ -5,11 +5,14 @@ #include "engine/Service.h" #include +#include #include #include "engine/CallFixedSize.h" +#include "engine/ExportQueryExecutionTrees.h" #include "engine/Values.h" #include "engine/VariableToColumnMap.h" +#include "global/RuntimeParameters.h" #include "parser/TokenizerCtre.h" #include "parser/TurtleParser.h" #include "util/Exception.h" @@ -21,10 +24,12 @@ // ____________________________________________________________________________ Service::Service(QueryExecutionContext* qec, parsedQuery::Service parsedServiceClause, - GetTsvFunction getTsvFunction) + GetTsvFunction getTsvFunction, + std::shared_ptr siblingTree) : Operation{qec}, parsedServiceClause_{std::move(parsedServiceClause)}, - getTsvFunction_{std::move(getTsvFunction)} {} + getTsvFunction_{std::move(getTsvFunction)}, + 
siblingTree_{std::move(siblingTree)} {} // ____________________________________________________________________________ std::string Service::getCacheKeyImpl() const { @@ -32,7 +37,11 @@ std::string Service::getCacheKeyImpl() const { // TODO: 
This duplicates code in GraphPatternOperation.cpp . os << "SERVICE " << parsedServiceClause_.serviceIri_.toSparql() << " {\n" << parsedServiceClause_.prologue_ << "\n" - << parsedServiceClause_.graphPatternAsString_ << "\n}\n"; + << parsedServiceClause_.graphPatternAsString_ << "\n"; + if (siblingTree_ != nullptr) { + os << siblingTree_->getRootOperation()->getCacheKey() << "\n"; + } + os << "}\n"; return std::move(os).str(); } @@ -92,6 +101,14 @@ Result Service::computeResult([[maybe_unused]] bool requestLaziness) { serviceIriString.remove_suffix(1); ad_utility::httpUtils::Url serviceUrl{serviceIriString}; + // Try to simplify the Service Query using its sibling Operation. + if (auto valuesClause = getSiblingValuesClause(); valuesClause.has_value()) { + auto openBracketPos = parsedServiceClause_.graphPatternAsString_.find('{'); + parsedServiceClause_.graphPatternAsString_ = + "{\n" + valuesClause.value() + '\n' + + parsedServiceClause_.graphPatternAsString_.substr(openBracketPos + 1); + } + // Construct the query to be sent to the SPARQL endpoint. 
std::string variablesForSelectClause = absl::StrJoin( parsedServiceClause_.visibleVariables_, " ", Variable::AbslFormatter); @@ -159,6 +176,77 @@ Result Service::computeResult([[maybe_unused]] bool requestLaziness) { return {std::move(idTable), resultSortedOn(), std::move(localVocab)}; } +// ____________________________________________________________________________ +std::optional Service::getSiblingValuesClause() const { + if (siblingTree_ == nullptr) { + return std::nullopt; + } + + const auto& siblingResult = siblingTree_->getResult(); + if (siblingResult->idTable().size() > + RuntimeParameters().get<"service-max-value-rows">()) { + return std::nullopt; + } + + checkCancellation(); + + std::vector commonColumnIndices; + const auto& siblingVars = siblingTree_->getVariableColumns(); + std::string vars = "("; + for (const auto& localVar : parsedServiceClause_.visibleVariables_) { + auto it = siblingVars.find(localVar); + if (it == siblingVars.end()) { + continue; + } + absl::StrAppend(&vars, it->first.name(), " "); + commonColumnIndices.push_back(it->second.columnIndex_); + } + // Without a shared variable the clause built below would degenerate to + // the malformed `VALUES ) { ) }`, so there is nothing useful to inject. + if (commonColumnIndices.empty()) { + return std::nullopt; + } + vars.back() = ')'; + + checkCancellation(); + + ad_utility::HashSet rowSet; + + std::string values = " { "; + for (size_t rowIndex = 0; rowIndex < siblingResult->idTable().size(); + ++rowIndex) { + std::string row = "("; + for (const auto& columnIdx : commonColumnIndices) { + const auto& optionalString = ExportQueryExecutionTrees::idToStringAndType( + siblingTree_->getRootOperation()->getIndex(), + siblingResult->idTable()(rowIndex, columnIdx), + siblingResult->localVocab()); + + // No string form exists for this ID. Emit `UNDEF`, which leaves the + // variable unconstrained in this row, so that every row keeps exactly + // one term per variable. Dropping the term would leave the row with + // fewer terms than variables. + absl::StrAppend(&row, + optionalString.has_value() + ? optionalString.value().first + : std::string{"UNDEF"}, + " "); + } + row.back() = ')'; + + if (rowSet.contains(row)) { + continue; + } + + rowSet.insert(row); + absl::StrAppend(&values, row, " "); + + checkCancellation(); + } + + return "VALUES " + vars + values + "} . 
"; +} + // ____________________________________________________________________________ template void Service::writeTsvResult(cppcoro::generator tsvResult, diff --git a/src/engine/Service.h b/src/engine/Service.h index 8a942bf145..9594b66b19 100644 --- a/src/engine/Service.h +++ b/src/engine/Service.h @@ -46,6 +46,9 @@ class Service : public Operation { // The function used to obtain the result from the remote endpoint. GetTsvFunction getTsvFunction_; + // The siblingTree, used for SERVICE clause optimization. + std::shared_ptr siblingTree_; + public: // Construct from parsed Service clause. // @@ -54,7 +57,14 @@ class Service : public Operation { // but in our tests (`ServiceTest`) we use a mock function that does not // require a running `HttpServer`. Service(QueryExecutionContext* qec, parsedQuery::Service parsedServiceClause, - GetTsvFunction getTsvFunction = sendHttpOrHttpsRequest); + GetTsvFunction getTsvFunction = sendHttpOrHttpsRequest, + std::shared_ptr siblingTree = nullptr); + + // Set the siblingTree (subTree that will later be joined with the Result of + // the Service Operation), used to reduce the Service Queries Complexity. + void setSiblingTree(std::shared_ptr siblingTree) { + siblingTree_ = siblingTree; + } // Methods inherited from base class `Operation`. std::string getDescriptor() const override; @@ -62,6 +72,12 @@ class Service : public Operation { std::vector resultSortedOn() const override { return {}; } float getMultiplicity(size_t col) override; + // Getters for testing. + const auto& getSiblingTree() const { return siblingTree_; } + const auto& getGraphPatternAsString() const { + return parsedServiceClause_.graphPatternAsString_; + } + private: uint64_t getSizeEstimateBeforeLimit() override; @@ -82,6 +98,12 @@ class Service : public Operation { // Compute the result using `getTsvFunction_`. Result computeResult([[maybe_unused]] bool requestLaziness) override; + // Get a VALUES clause that contains the values of the siblingTree's result. + // Returns `std::nullopt` if there is no sibling tree, if the sibling result + // has more rows than the `service-max-value-rows` runtime parameter, or if + // the sibling shares no variable with this SERVICE clause. 
+ std::optional getSiblingValuesClause() const; + // Write the given TSV result to the given result object. The `I` is the width // of the result table. // diff --git a/src/engine/Values.cpp b/src/engine/Values.cpp index ec6451f3eb..5c61b3cfe7 100644 --- a/src/engine/Values.cpp +++ b/src/engine/Values.cpp @@ -130,8 +130,10 @@ void Values::writeValues(IdTable* idTablePtr, LocalVocab* localVocab) { std::vector numLocalVocabPerColumn(idTable.numColumns()); for (auto& row : parsedValues_._values) { for (size_t colIdx = 0; colIdx < idTable.numColumns(); colIdx++) { - TripleComponent& tc = row[colIdx]; - Id id = std::move(tc).toValueId(getIndex().getVocab(), *localVocab); + const TripleComponent& tc = row[colIdx]; + // TODO We don't want to move, but also don't want to + // unconditionally copy. + Id id = TripleComponent{tc}.toValueId(getIndex().getVocab(), *localVocab); idTable(rowIdx, colIdx) = id; if (id.getDatatype() == Datatype::LocalVocabIndex) { ++numLocalVocabPerColumn[colIdx]; diff --git a/src/global/RuntimeParameters.h b/src/global/RuntimeParameters.h index 5bea460cc9..1d23896032 100644 --- a/src/global/RuntimeParameters.h +++ b/src/global/RuntimeParameters.h @@ -46,7 +46,8 @@ inline auto& RuntimeParameters() { 30s}), SizeT<"lazy-index-scan-max-size-materialization">{1'000'000}, Bool<"use-binsearch-transitive-path">{true}, - Bool<"group-by-hash-map-enabled">{false}}; + Bool<"group-by-hash-map-enabled">{false}, + SizeT<"service-max-value-rows">{100}}; }(); return params; } diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index 6b950301d2..7451016f2a 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -1062,3 +1062,29 @@ TEST(QueryPlanner, CancellationCancelsQueryPlanning) { HasSubstr("Query planning"), ad_utility::CancellationException); } + +// ___________________________________________________________________________ +TEST(QueryPlanner, JoinWithService) { + auto scan = h::IndexScanFromStrings; + + auto sibling = 
scan("?x", "", "?y"); + + std::string_view graphPatternAsString = "{ ?x ?z . }"; + + h::expect( + "SELECT * WHERE {" + "SERVICE { ?x ?z . ?y ?a . }}", + h::Service(std::nullopt, "{ ?x ?z . ?y ?a . }")); + + h::expect( + "SELECT * WHERE { ?x ?y ." + "SERVICE { ?x ?z . }}", + h::UnorderedJoins(sibling, h::Service(sibling, graphPatternAsString))); + + h::expect( + "SELECT * WHERE { ?x ?y . " + "SERVICE { ?x ?z . ?y ?a . }}", + h::MultiColumnJoin( + sibling, + h::Sort(h::Service(sibling, "{ ?x ?z . ?y ?a . }")))); +} diff --git a/test/QueryPlannerTestHelpers.h b/test/QueryPlannerTestHelpers.h index 28d5c9086a..c33f9a2faa 100644 --- a/test/QueryPlannerTestHelpers.h +++ b/test/QueryPlannerTestHelpers.h @@ -19,6 +19,7 @@ #include "engine/OrderBy.h" #include "engine/QueryExecutionTree.h" #include "engine/QueryPlanner.h" +#include "engine/Service.h" #include "engine/Sort.h" #include "engine/TextIndexScanForEntity.h" #include "engine/TextIndexScanForWord.h" @@ -285,6 +286,26 @@ constexpr auto OrderBy = [](const ::OrderBy::SortedVariables& sortedVariables, // Match a `UNION` operation. constexpr auto Union = MatchTypeAndOrderedChildren<::Union>; +// Match a `SERVICE` operation. +// If `siblingMatcher` has a value, the sibling tree of the SERVICE must be +// non-null and match it; otherwise the sibling tree must be null. In both +// cases the graph pattern string must equal `graphPatternAsString`. 
+constexpr auto Service = [](const std::optional& siblingMatcher, + std::string_view graphPatternAsString) { + const auto optSiblingMatcher = + [&]() -> Matcher&> { + if (siblingMatcher.has_value()) { + return Pointee(siblingMatcher.value()); + } + return IsNull(); + }(); + + return RootOperation<::Service>( + AllOf(AD_PROPERTY(::Service, getSiblingTree, optSiblingMatcher), + AD_PROPERTY(::Service, getGraphPatternAsString, + Eq(graphPatternAsString)))); +}; + /// Parse the given SPARQL `query`, pass it to a `QueryPlanner` with empty /// execution context, and return the resulting `QueryExecutionTree` QueryExecutionTree parseAndPlan(std::string query, QueryExecutionContext* qec) { diff --git a/test/ServiceTest.cpp b/test/ServiceTest.cpp index dc27f25a8b..161429154c 100644 --- a/test/ServiceTest.cpp +++ b/test/ServiceTest.cpp @@ -8,9 +8,11 @@ #include #include "engine/Service.h" +#include "global/RuntimeParameters.h" #include "parser/GraphPatternOperation.h" #include "util/IdTableHelpers.h" #include "util/IndexTestHelpers.h" +#include "util/TripleComponentTestHelpers.h" #include "util/http/HttpUtils.h" // Fixture that sets up a test index and a factory for producing mocks for the @@ -192,4 +194,100 @@ TEST_F(ServiceTest, computeResult) { IdTable expectedIdTable = makeIdTableFromVector( {{idX, idY}, {idBla, idBli}, {idBlu, idBla}, {idBli, idBlu}}); EXPECT_EQ(result->idTable(), expectedIdTable); + + // Check 5: When a siblingTree with variables common to the Service Clause is + // passed, the Service Operation shall use the sibling's result to reduce + // its Query complexity by injecting it as a VALUES clause. + auto iri = ad_utility::testing::iri; + using TC = TripleComponent; + auto siblingTree = std::make_shared( + testQec, + std::make_shared( + testQec, + parsedQuery::SparqlValues{ + {Variable{"?x"}, Variable{"?y"}, Variable{"?z"}}, + {{TC(iri("")), TC(iri("")), TC(iri(""))}, + {TC(iri("")), TC(iri("")), TC(iri(""))}, + {TC(iri("")), TC(iri("")), TC(iri(""))}}})); 
+ + auto parsedServiceClause5 = parsedServiceClause; + parsedServiceClause5.graphPatternAsString_ = + "{ ?x ?y . ?y ?z2 . }"; + parsedServiceClause5.visibleVariables_.emplace_back("?z2"); + + std::string_view expectedSparqlQuery5 = + "PREFIX doof: SELECT ?x ?y ?z2 " + "WHERE { VALUES (?x ?y) { ( ) ( ) } . ?x ?y . ?y " + " ?z2 . }"; + + Service serviceOperation5{ + testQec, parsedServiceClause5, + getTsvFunctionFactory(expectedUrl, expectedSparqlQuery5, + "?x\t?y\t?z2\n\t\t\n\t\t\n<" + "blu>\t\t\n\t\t\n"), + siblingTree}; + EXPECT_NO_THROW(serviceOperation5.getResult()); + + // Check 6: The sibling's rows exceed `service-max-value-rows`. + const auto maxValueRowsDefault = + RuntimeParameters().get<"service-max-value-rows">(); + RuntimeParameters().set<"service-max-value-rows">(0); + testQec->getQueryTreeCache().clearAll(); + std::string_view expectedSparqlQuery6 = + "PREFIX doof: SELECT ?x ?y ?z2 " + "WHERE { ?x ?y . ?y ?z2 . }"; + Service serviceOperation6{ + testQec, parsedServiceClause5, + getTsvFunctionFactory(expectedUrl, expectedSparqlQuery6, + "?x\t?y\t?z2\n\t\t\n\t\t\n<" + "blu>\t\t\n\t\t\n"), + siblingTree}; + EXPECT_NO_THROW(serviceOperation6.getResult()); + RuntimeParameters().set<"service-max-value-rows">(maxValueRowsDefault); +} + +TEST_F(ServiceTest, getCacheKey) { + parsedQuery::Service parsedServiceClause{{Variable{"?x"}, Variable{"?y"}}, + Iri{""}, + "PREFIX doof: ", + "{ }"}; + + // The cacheKey of the Service Operation has to depend on the cacheKey of + // the siblingTree, as it might alter the Service Query. 
+ + Service service( + testQec, parsedServiceClause, + getTsvFunctionFactory( + "http://localhorst:80/api", + "PREFIX doof: SELECT ?x ?y WHERE { }", + "?x\t?y\n\t\n\t\n\t\n\t\n")); + + auto ck_noSibling = service.getCacheKey(); + + auto iri = ad_utility::testing::iri; + using TC = TripleComponent; + auto siblingTree = std::make_shared( + testQec, + std::make_shared( + testQec, + parsedQuery::SparqlValues{ + {Variable{"?x"}, Variable{"?y"}, Variable{"?z"}}, + {{TC(iri("")), TC(iri("")), TC(iri(""))}, + {TC(iri("")), TC(iri("")), TC(iri(""))}}})); + service.setSiblingTree(siblingTree); + + auto ck_sibling = service.getCacheKey(); + EXPECT_NE(ck_noSibling, ck_sibling); + + auto siblingTree2 = std::make_shared( + testQec, + std::make_shared( + testQec, parsedQuery::SparqlValues{ + {Variable{"?x"}, Variable{"?y"}, Variable{"?z"}}, + {{TC(iri("")), TC(iri("")), TC(iri(""))}}})); + + service.setSiblingTree(siblingTree2); + + auto ck_changedSibling = service.getCacheKey(); + EXPECT_NE(ck_sibling, ck_changedSibling); }