Skip to content

Commit

Permalink
Implement DESCRIBE (#1624)
Browse files Browse the repository at this point in the history
Implement DESCRIBE according to the Concise Bounded Description (CBD) specification: https://www.w3.org/submissions/2005/SUBM-CBD-20050603 . That is, do not only show those triples where the subject is one of the resources to describe but recursively expand reification nodes. The current implementation recursively expands blank nodes. Here is an example query on Wikidata, where that makes a difference: https://qlever.cs.uni-freiburg.de/wikidata/obesyx

In a future PR, add an option to configure which other nodes to consider for expansion. For example, for Wikidata, we would also like to expand all object IRIs that start with the prefix `http://www.wikidata.org/entity/statement/`.

Co-authored-by: Hannah Bast <[email protected]>
  • Loading branch information
joka921 and Hannah Bast authored Dec 14, 2024
1 parent 4237e0d commit 27f4799
Show file tree
Hide file tree
Showing 23 changed files with 812 additions and 84 deletions.
3 changes: 2 additions & 1 deletion src/engine/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@ add_library(engine
VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp
CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp
TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp
CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp)
CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp
Describe.cpp)
qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2)
6 changes: 3 additions & 3 deletions src/engine/CheckUsePatternTrick.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,9 @@ bool isVariableContainedInGraphPatternOperation(
} else if constexpr (std::is_same_v<T, p::Service>) {
return ad_utility::contains(arg.visibleVariables_, variable);
} else {
static_assert(std::is_same_v<T, p::TransPath> ||
std::is_same_v<T, p::PathQuery> ||
std::is_same_v<T, p::SpatialQuery>);
static_assert(
std::is_same_v<T, p::TransPath> || std::is_same_v<T, p::PathQuery> ||
std::is_same_v<T, p::Describe> || std::is_same_v<T, p::SpatialQuery>);
// The `TransPath` is set up later in the query planning, when this
// function should not be called anymore.
AD_FAIL();
Expand Down
247 changes: 247 additions & 0 deletions src/engine/Describe.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
// Copyright 2024, University of Freiburg,
// Chair of Algorithms and Data Structures
// Author: Johannes Kalmbach <[email protected]>

#include "engine/Describe.h"

#include "../../test/engine/ValuesForTesting.h"
#include "engine/IndexScan.h"
#include "engine/Join.h"

// _____________________________________________________________________________
// Construct a DESCRIBE operation from the execution tree of the WHERE clause
// (`subtree`) and the parsed DESCRIBE clause (`describe`). Both arguments are
// taken by value and moved into the corresponding members.
Describe::Describe(QueryExecutionContext* qec,
std::shared_ptr<QueryExecutionTree> subtree,
parsedQuery::Describe describe)
: Operation{qec},
subtree_{std::move(subtree)},
describe_{std::move(describe)} {
// If the DESCRIBE query has no WHERE clause, `subtree_` is the neutral
// element, but never `nullptr`.
AD_CORRECTNESS_CHECK(subtree_ != nullptr);
}

// _____________________________________________________________________________
// Return the only child of this operation: the execution tree of the WHERE
// clause. The returned pointer is non-owning (`subtree_` keeps the ownership).
std::vector<QueryExecutionTree*> Describe::getChildren() {
return {subtree_.get()};
}

// _____________________________________________________________________________
// Build the cache key from three parts: the cache key of the `subtree_` (the
// WHERE clause), the `resources_` of the DESCRIBE clause (variables and IRIs),
// and the default graphs from the FROM clauses (if any).
string Describe::getCacheKeyImpl() const {
  std::string key = absl::StrCat("DESCRIBE ", subtree_->getCacheKey(), " ");

  // An IRI contributes its string representation. A variable contributes the
  // index of its column in the `subtree_`, or `size_t(-1)` if the `subtree_`
  // has no column for that variable.
  for (const auto& resource : describe_.resources_) {
    if (const auto* iri = std::get_if<TripleComponent::Iri>(&resource)) {
      key.append(iri->toStringRepresentation());
      continue;
    }
    const auto& variable = std::get<Variable>(resource);
    const auto columnIndex =
        subtree_->getVariableColumnOrNullopt(variable).value_or(
            static_cast<size_t>(-1));
    absl::StrAppend(&key, "column #", columnIndex, " ");
  }

  // NOTE: The default and named graphs are also part of the cache key of the
  // `subtree_`. However, the named graphs only determine the result for
  // `subtree_` (the resources to be described), whereas the default graphs
  // also determine which triples for these resources become part of the
  // result. That is why the default graphs are added here explicitly.
  const auto& defaultGraphs = describe_.datasetClauses_.defaultGraphs_;
  if (!defaultGraphs.has_value()) {
    return key;
  }

  // Sort the graph names, so that the key does not depend on the iteration
  // order of the container that stores the graphs.
  std::vector<std::string> graphNames;
  for (const auto& graph : defaultGraphs.value()) {
    graphNames.push_back(graph.toRdfLiteral());
  }
  std::ranges::sort(graphNames);
  absl::StrAppend(&key,
                  "\nFiltered by Graphs:", absl::StrJoin(graphNames, " "));
  return key;
}

// _____________________________________________________________________________
// The descriptor is the fixed string "DESCRIBE"; it does not depend on the
// resources or on the WHERE clause.
string Describe::getDescriptor() const { return "DESCRIBE"; }

// _____________________________________________________________________________
// The result always has exactly three columns (see
// `computeVariableToColumnMap`: subject, predicate, object).
size_t Describe::getResultWidth() const { return 3; }

// As DESCRIBE is never part of the query planning (it is always the root
// operation), we can return dummy values for the following functions.
//
// The cost estimate is simply twice the cost estimate of the WHERE clause.
size_t Describe::getCostEstimate() { return 2 * subtree_->getCostEstimate(); }
// Rough estimate: twice the size estimate of the WHERE clause. The actual
// result size depends on the number of triples per described resource.
uint64_t Describe::getSizeEstimateBeforeLimit() {
return subtree_->getSizeEstimate() * 2;
}
// Return the constant `1.0f` for every column; the argument `col` is ignored.
float Describe::getMultiplicity([[maybe_unused]] size_t col) { return 1.0f; }
// Always return `false`, that is, the result is never reported as known to be
// empty without computing it.
bool Describe::knownEmptyResult() { return false; }

// The result cannot easily be sorted, as it involves recursive expanding of
// graphs. Hence, return an empty list of columns (no sortedness is promised).
vector<ColumnIndex> Describe::resultSortedOn() const { return {}; }

// The result has exactly three columns, bound to the variables `?subject`,
// `?predicate`, and `?object` (in this order). Each of them is registered via
// `makeAlwaysDefinedColumn`.
//
// NOTE: These variable names are hardcoded in the implicit CONSTRUCT query
// created in `SparqlQleverVisitor::visitDescribe`.
VariableToColumnMap Describe::computeVariableToColumnMap() const {
  VariableToColumnMap variableToColumn;
  variableToColumn.emplace(Variable("?subject"), makeAlwaysDefinedColumn(0));
  variableToColumn.emplace(Variable("?predicate"), makeAlwaysDefinedColumn(1));
  variableToColumn.emplace(Variable("?object"), makeAlwaysDefinedColumn(2));
  return variableToColumn;
}

// Helper for the BFS over blank nodes. Scan `input` (one column of `Id`s) and
// return, as an `IdTable` with one column, every `Id` that is a blank node and
// was not yet contained in `alreadySeen`. Each such `Id` is returned only once
// (in the order of its first occurrence) and is added to `alreadySeen`.
static IdTable getNewBlankNodes(
    const auto& allocator, ad_utility::HashSetWithMemoryLimit<Id>& alreadySeen,
    std::span<Id> input) {
  // Allocate space for the worst case (every input `Id` is a new blank node)
  // and shrink to the actual size at the end.
  IdTable newBlankNodes{1, allocator};
  newBlankNodes.resize(input.size());
  decltype(auto) column = newBlankNodes.getColumn(0);

  size_t numFound = 0;
  for (Id candidate : input) {
    // Only blank nodes are inserted into `alreadySeen`; all other `Id`s are
    // skipped without touching the set.
    if (candidate.getDatatype() == Datatype::BlankNodeIndex) {
      auto [position, wasInserted] = alreadySeen.emplace(candidate);
      if (wasInserted) {
        column[numFound] = candidate;
        ++numFound;
      }
    }
  }
  newBlankNodes.resize(numFound);
  return newBlankNodes;
}

// _____________________________________________________________________________
// Add all triples whose subject is one of the `blankNodes` (an `IdTable` with
// one column) to `finalResult`, and continue with the blank nodes that occur
// as objects of these triples and are not yet contained in `alreadySeen`,
// until no new blank nodes are found. Local vocab entries of the added
// triples are merged into `localVocab`.
//
// NOTE: This breadth-first search is implemented as a loop (one iteration per
// BFS level) and not as a recursion. The results are the same, but the table
// of each level is released before the next level is computed, and the depth
// of the call stack does not grow with the length of blank node chains (for
// example, long RDF lists).
void Describe::recursivelyAddBlankNodes(
    IdTable& finalResult, LocalVocab& localVocab,
    ad_utility::HashSetWithMemoryLimit<Id>& alreadySeen, IdTable blankNodes) {
  AD_CORRECTNESS_CHECK(blankNodes.numColumns() == 1);

  // If there are no more `blankNodes` to explore, we are done.
  while (!blankNodes.empty()) {
    // Expand the `blankNodes` by joining them with the full index and add the
    // resulting triples to the `finalResult`.
    //
    // TODO<joka921> Make the result of DESCRIBE lazy, then we can avoid the
    // additional copy here.
    auto table =
        makeAndExecuteJoinWithFullIndex(std::move(blankNodes), localVocab);
    finalResult.insertAtEnd(table);

    // The blank nodes for the next level are the objects of the new triples
    // that have not been seen before. Assigning to `blankNodes` after it was
    // moved from is fine, because the assignment gives it a fresh value.
    blankNodes =
        getNewBlankNodes(allocator(), alreadySeen, table.getColumn(2));
  }
}

// _____________________________________________________________________________
// Join `input` (an `IdTable` with one column, interpreted as `?subject`) with
// a scan of the full index and return the matching triples as a table with
// the three columns subject, predicate, object. The local vocab of the join
// result is merged into `localVocab`.
IdTable Describe::makeAndExecuteJoinWithFullIndex(
    IdTable input, LocalVocab& localVocab) const {
  AD_CORRECTNESS_CHECK(input.numColumns() == 1);

  Variable subject{"?subject"};
  Variable predicate{"?predicate"};
  Variable object{"?object"};
  auto* qec = getExecutionContext();

  // Left side of the join: the `input`, wrapped into an operation with the
  // single column `?subject`.
  auto inputTree = ad_utility::makeExecutionTree<ValuesForTesting>(
      qec, std::move(input), std::vector<std::optional<Variable>>{subject});

  // Right side of the join: a scan for `?subject ?predicate ?object` on the
  // SPO permutation, restricted to the default graphs of the DESCRIBE query.
  SparqlTripleSimple fullIndexTriple{subject, predicate, object};
  auto scanTree = ad_utility::makeExecutionTree<IndexScan>(
      qec, Permutation::SPO, fullIndexTriple,
      describe_.datasetClauses_.defaultGraphs_);

  // The join columns have to be determined before the trees are moved into
  // the `Join` operation.
  auto inputJoinColumn = inputTree->getVariableColumn(subject);
  auto scanJoinColumn = scanTree->getVariableColumn(subject);
  auto joinTree = ad_utility::makeExecutionTree<Join>(
      qec, std::move(inputTree), std::move(scanTree), inputJoinColumn,
      scanJoinColumn);

  auto joinResult = joinTree->getResult();

  // The `IndexScan` might have added some delta triples with local vocab IDs,
  // so make sure to merge them into the `localVocab`.
  localVocab.mergeWith(std::span{&joinResult->localVocab(), 1});

  // Copy the result and bring its columns into the order `?subject`,
  // `?predicate`, `?object`.
  //
  // NOTE: Typically, the join result already has exactly those columns, in
  // which case `setColumnSubset` is a no-op. Not sure when this is not the
  // case, but better safe than sorry.
  IdTable triples = joinResult->idTable().clone();
  ColumnIndex subjectColumn = joinTree->getVariableColumn(subject);
  ColumnIndex predicateColumn = joinTree->getVariableColumn(predicate);
  ColumnIndex objectColumn = joinTree->getVariableColumn(object);
  triples.setColumnSubset(
      std::vector{subjectColumn, predicateColumn, objectColumn});
  return triples;
}

// _____________________________________________________________________________
// Collect the `Id`s of all resources to describe: one `Id` per IRI of the
// DESCRIBE clause, and all `Id`s from the `result` of the WHERE clause in the
// columns of the variables of the DESCRIBE clause. The `Id`s are returned as
// an `IdTable` with one column, without duplicates.
IdTable Describe::getIdsToDescribe(const Result& result,
                                   LocalVocab& localVocab) const {
  // A hash set is used as intermediate storage in order to remove duplicates.
  ad_utility::HashSetWithMemoryLimit<Id> uniqueIds{allocator()};
  const auto& vocab = getIndex().getVocab();

  for (const auto& resource : describe_.resources_) {
    // An IRI is converted to its `Id` via `toValueId`, which is given both
    // the index vocabulary and the `localVocab`.
    if (const auto* iri = std::get_if<TripleComponent::Iri>(&resource)) {
      uniqueIds.insert(TripleComponent{*iri}.toValueId(vocab, localVocab));
      continue;
    }

    // A variable contributes all `Id`s of its column in the `result`. A
    // variable without a column in the `subtree_` contributes nothing.
    //
    // NOTE(review): The `Id`s are inserted as they are; if the column can
    // contain undefined values, these are passed on as well. Confirm that
    // this is intended.
    const auto& variable = std::get<Variable>(resource);
    const auto column = subtree_->getVariableColumnOrNullopt(variable);
    if (column.has_value()) {
      for (Id id : result.idTable().getColumn(column.value())) {
        uniqueIds.insert(id);
      }
    }
  }

  // Materialize the unique `Id`s as a table with a single column.
  IdTable idTable{1, allocator()};
  idTable.resize(uniqueIds.size());
  std::ranges::copy(uniqueIds, idTable.getColumn(0).begin());
  return idTable;
}

// _____________________________________________________________________________
// Compute the result of the DESCRIBE query in three steps:
//
// 1. Compute the WHERE clause and extract the `Id`s of the resources to
//    describe.
// 2. Get all triples that have one of these `Id`s as subject.
// 3. Starting from the objects of these triples, follow all blank nodes and
//    add the triples found along the way.
//
// The result is always fully materialized (`requestLaziness` is ignored).
ProtoResult Describe::computeResult([[maybe_unused]] bool requestLaziness) {
  LocalVocab localVocab;
  // Compute the results of the WHERE clause and extract the `Id`s to describe.
  //
  // TODO<joka921> Would we benefit from computing `resultOfWhereClause` lazily?
  // Probably not, because we have to deduplicate the whole input anyway.
  auto resultOfWhereClause = subtree_->getResult();
  auto idsAsTable = getIdsToDescribe(*resultOfWhereClause, localVocab);

  // Blank nodes that are described themselves are expanded by the join below,
  // so mark them as already seen. Otherwise, such a blank node would be
  // expanded a second time (and its triples would be added twice) as soon as
  // it also occurs as the object of one of the triples found.
  //
  // NOTE: This has to happen before `idsAsTable` is moved into the join.
  ad_utility::HashSetWithMemoryLimit<Id> alreadySeen{allocator()};
  for (Id id : idsAsTable.getColumn(0)) {
    if (id.getDatatype() == Datatype::BlankNodeIndex) {
      alreadySeen.insert(id);
    }
  }

  // Get all triples with the `Id`s as subject.
  auto resultTable =
      makeAndExecuteJoinWithFullIndex(std::move(idsAsTable), localVocab);

  // Recursively follow all blank nodes.
  auto blankNodes =
      getNewBlankNodes(allocator(), alreadySeen, resultTable.getColumn(2));
  recursivelyAddBlankNodes(resultTable, localVocab, alreadySeen,
                           std::move(blankNodes));

  return {std::move(resultTable), resultSortedOn(), std::move(localVocab)};
}
78 changes: 78 additions & 0 deletions src/engine/Describe.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright 2024, University of Freiburg,
// Chair of Algorithms and Data Structures
// Author: Johannes Kalmbach <[email protected]>

#pragma once

#include "engine/Operation.h"
#include "parser/GraphPatternOperation.h"

// Operation that evaluates DESCRIBE queries following the Concise Bounded
// Description (CBD) specification:
// https://www.w3.org/submissions/2005/SUBM-CBD-20050603 .
//
// NOTE: Currently, only blank nodes are expanded recursively. Support for
// other reification schemes can be added relatively easily (for example, for
// Wikidata, one would also expand all object IRIs that start with the prefix
// `http://www.wikidata.org/entity/statement/`).
class Describe : public Operation {
 public:
  // Create a DESCRIBE operation from the execution tree of the WHERE clause
  // (`subtree`) and the parsed DESCRIBE clause (`describe`).
  Describe(QueryExecutionContext* qec,
           std::shared_ptr<QueryExecutionTree> subtree,
           parsedQuery::Describe describe);

  // Read access to the parsed DESCRIBE clause (for testing).
  const auto& getDescribe() const { return describe_; }

  // Overrides of the public member functions of the base class `Operation`.
  std::vector<QueryExecutionTree*> getChildren() override;
  string getCacheKeyImpl() const override;
  string getDescriptor() const override;
  size_t getResultWidth() const override;
  size_t getCostEstimate() override;
  float getMultiplicity(size_t col) override;
  bool knownEmptyResult() override;

 private:
  // Overrides of the member functions of the base class `Operation` that are
  // not part of the public interface of this class.
  uint64_t getSizeEstimateBeforeLimit() override;
  [[nodiscard]] vector<ColumnIndex> resultSortedOn() const override;
  ProtoResult computeResult(bool requestLaziness) override;
  VariableToColumnMap computeVariableToColumnMap() const override;

  // Breadth-first search (BFS) over blank nodes. Add all triples that have
  // one of the `blankNodes` (an `IdTable` with one column) as subject to the
  // `finalResult`, then continue with all blank nodes that occur as objects
  // of these triples and are not yet contained in `alreadySeen`. Here,
  // `blankNodes` is the set of start nodes and `alreadySeen` is the set of
  // nodes that have already been explored; the latter is required in order to
  // handle cycles in the graph.
  void recursivelyAddBlankNodes(
      IdTable& finalResult, LocalVocab& localVocab,
      ad_utility::HashSetWithMemoryLimit<Id>& alreadySeen, IdTable blankNodes);

  // Join the `input` (an `IdTable` with one column) with the full index on
  // the subject column. The result has three columns: subject, predicate, and
  // object of each triple whose subject is contained in `input`. This
  // includes delta triples with local vocab IDs, which are added to the
  // `localVocab`.
  IdTable makeAndExecuteJoinWithFullIndex(IdTable input,
                                          LocalVocab& localVocab) const;

  // Return the (unique) IDs that match one of the IRIs or variables of the
  // DESCRIBE clause, given the `result` of the WHERE clause. For example, for
  // the query `DESCRIBE <x> ?y WHERE { ?y <p> <o>}`, return `<x>` and all
  // IRIs that match `?y` in the WHERE clause, with all duplicates removed.
  IdTable getIdsToDescribe(const Result& result, LocalVocab& localVocab) const;

  // The query execution tree for the WHERE clause of the DESCRIBE query. If
  // the query has no WHERE clause, this must be the neutral element.
  std::shared_ptr<QueryExecutionTree> subtree_;

  // The parsed DESCRIBE clause.
  parsedQuery::Describe describe_;
};
9 changes: 4 additions & 5 deletions src/engine/Operation.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
// Copyright 2015, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author:
// 2015-2017 Björn Buchhold ([email protected])
// 2018- Johannes Kalmbach ([email protected])
// Copyright 2015 - 2024, University of Freiburg
// Chair of Algorithms and Data Structures
// Authors: Björn Buchhold <[email protected]> [2015 - 2017]
// Johannes Kalmbach <[email protected]> [2018 - 2024]

#pragma once

Expand Down
14 changes: 14 additions & 0 deletions src/engine/QueryPlanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "engine/CheckUsePatternTrick.h"
#include "engine/CountAvailablePredicates.h"
#include "engine/CountConnectedSubgraphs.h"
#include "engine/Describe.h"
#include "engine/Distinct.h"
#include "engine/Filter.h"
#include "engine/GroupBy.h"
Expand Down Expand Up @@ -2349,6 +2350,8 @@ void QueryPlanner::GraphPatternPlanner::graphPatternOperationVisitor(Arg& arg) {
visitGroupOptionalOrMinus(std::move(candidates));
} else if constexpr (std::is_same_v<T, p::PathQuery>) {
visitPathSearch(arg);
} else if constexpr (std::is_same_v<T, p::Describe>) {
visitDescribe(arg);
} else if constexpr (std::is_same_v<T, p::SpatialQuery>) {
visitSpatialSearch(arg);
} else {
Expand Down Expand Up @@ -2587,3 +2590,14 @@ void QueryPlanner::GraphPatternPlanner::optimizeCommutatively() {
candidatePlans_.push_back(std::move(lastRow));
planner_.checkCancellation();
}

// _______________________________________________________________
// Plan a DESCRIBE clause: build the execution tree for its WHERE clause, wrap
// that tree into a `Describe` operation, and add the resulting plan as a new
// group with a single candidate to `candidatePlans_`.
void QueryPlanner::GraphPatternPlanner::visitDescribe(
    parsedQuery::Describe& describe) {
  auto whereClauseTree =
      planner_.createExecutionTree(describe.whereClause_.get(), true);
  auto sharedWhereClauseTree =
      std::make_shared<QueryExecutionTree>(std::move(whereClauseTree));

  // NOTE: `describe` is passed on as an lvalue here, so the caller's object
  // remains unchanged.
  auto describePlan = makeSubtreePlan<Describe>(
      planner_._qec, std::move(sharedWhereClauseTree), describe);

  std::vector plansForDescribe{std::move(describePlan)};
  candidatePlans_.push_back(std::move(plansForDescribe));
  planner_.checkCancellation();
}
Loading

0 comments on commit 27f4799

Please sign in to comment.