Implement DESCRIBE (#1624)

Implement DESCRIBE according to the Concise Bounded Description (CBD) specification: https://www.w3.org/submissions/2005/SUBM-CBD-20050603 . That is, do not only show those triples where the subject is one of the resources to describe but recursively expand reification nodes. The current implementation recursively expands blank nodes. Here is an example query on Wikidata, where that makes a difference: https://qlever.cs.uni-freiburg.de/wikidata/obesyx In a future PR, add an option to configure which other nodes to consider for expansion. For example, for Wikidata, we would also like to expand all object IRIs that start with the prefix `http://www.wikidata.org/entity/statement/`. Co-authored-by: Hannah Bast <[email protected]>
ad-freiburg · Dec 14, 2024 · 27f4799 · 27f4799
1 parent 4237e0d
commit 27f4799
Show file tree

Hide file tree

Showing 23 changed files with 812 additions and 84 deletions.
diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt
@@ -13,5 +13,6 @@ add_library(engine
         VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp
         CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp
         TextLimit.cpp LazyGroupBy.cpp GroupByHashMapOptimization.cpp SpatialJoin.cpp
-        CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp)
+        CountConnectedSubgraphs.cpp SpatialJoinAlgorithms.cpp PathSearch.cpp ExecuteUpdate.cpp
+        Describe.cpp)
 qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams s2)
diff --git a/src/engine/CheckUsePatternTrick.cpp b/src/engine/CheckUsePatternTrick.cpp
@@ -72,9 +72,9 @@ bool isVariableContainedInGraphPatternOperation(
     } else if constexpr (std::is_same_v<T, p::Service>) {
       return ad_utility::contains(arg.visibleVariables_, variable);
     } else {
-      static_assert(std::is_same_v<T, p::TransPath> ||
-                    std::is_same_v<T, p::PathQuery> ||
-                    std::is_same_v<T, p::SpatialQuery>);
+      static_assert(
+          std::is_same_v<T, p::TransPath> || std::is_same_v<T, p::PathQuery> ||
+          std::is_same_v<T, p::Describe> || std::is_same_v<T, p::SpatialQuery>);
       // The `TransPath` is set up later in the query planning, when this
       // function should not be called anymore.
       AD_FAIL();

diff --git a/src/engine/Describe.cpp b/src/engine/Describe.cpp
@@ -0,0 +1,247 @@
+// Copyright 2024, University of Freiburg,
+// Chair of Algorithms and Data Structures
+// Author: Johannes Kalmbach <[email protected]>
+
+#include "engine/Describe.h"
+
+#include "../../test/engine/ValuesForTesting.h"
+#include "engine/IndexScan.h"
+#include "engine/Join.h"
+
+// _____________________________________________________________________________
+// Construct a DESCRIBE operation from the execution tree of the WHERE clause
+// (`subtree`) and the parsed DESCRIBE clause (`describe`). Both arguments are
+// taken by value and moved into the corresponding members.
+Describe::Describe(QueryExecutionContext* qec,
+                   std::shared_ptr<QueryExecutionTree> subtree,
+                   parsedQuery::Describe describe)
+    : Operation{qec},
+      subtree_{std::move(subtree)},
+      describe_{std::move(describe)} {
+  // If the DESCRIBE query has no WHERE clause, `subtree_` is the neutral
+  // element, but never `nullptr`.
+  AD_CORRECTNESS_CHECK(subtree_ != nullptr);
+}
+
+// _____________________________________________________________________________
+// The only child is the execution tree of the WHERE clause. The returned
+// pointer is non-owning; the tree remains owned by `subtree_`.
+std::vector<QueryExecutionTree*> Describe::getChildren() {
+  return {subtree_.get()};
+}
+
+// _____________________________________________________________________________
+string Describe::getCacheKeyImpl() const {
+  // The key starts with the cache key of the WHERE clause (`subtree_`) and is
+  // followed by one entry per resource of the DESCRIBE clause.
+  std::string key = absl::StrCat("DESCRIBE ", subtree_->getCacheKey(), " ");
+  for (const auto& resource : describe_.resources_) {
+    // An IRI is represented by its string representation.
+    if (const auto* iri = std::get_if<TripleComponent::Iri>(&resource)) {
+      key.append(iri->toStringRepresentation());
+      continue;
+    }
+    // A variable is represented by its column index in the result of the
+    // WHERE clause. A variable that does not occur there is represented by
+    // the largest `size_t` value.
+    const auto column =
+        subtree_->getVariableColumnOrNullopt(std::get<Variable>(resource));
+    absl::StrAppend(&key, "column #",
+                    column.value_or(static_cast<size_t>(-1)), " ");
+  }
+
+  // Append the default graphs (from the FROM clauses) in sorted order, such
+  // that the key does not depend on the order in which they are stored.
+  //
+  // NOTE: The default and named graphs are also part of the cache key of the
+  // `subtree_`. However, the named graphs only determine the result for
+  // `subtree_` (the resources to be described), whereas the default graphs
+  // also determine which triples for these resources become part of the result.
+  if (const auto& graphs = describe_.datasetClauses_.defaultGraphs_;
+      graphs.has_value()) {
+    std::vector<std::string> graphNames;
+    std::ranges::transform(graphs.value(), std::back_inserter(graphNames),
+                           &TripleComponent::toRdfLiteral);
+    std::ranges::sort(graphNames);
+    absl::StrAppend(&key,
+                    "\nFiltered by Graphs:", absl::StrJoin(graphNames, " "));
+  }
+  return key;
+}
+
+// _____________________________________________________________________________
+// The descriptor is the fixed string "DESCRIBE", independent of the query.
+string Describe::getDescriptor() const { return "DESCRIBE"; }
+
+// _____________________________________________________________________________
+// The result always has exactly three columns: subject, predicate, and object
+// (see `computeVariableToColumnMap`).
+size_t Describe::getResultWidth() const { return 3; }
+
+// As DESCRIBE is never part of the query planning (it is always the root
+// operation), the following functions only return rough placeholder values:
+// twice the cost and twice the size estimate of the WHERE clause, a
+// multiplicity of 1 for every column, and never a known empty result.
+size_t Describe::getCostEstimate() { return 2 * subtree_->getCostEstimate(); }
+uint64_t Describe::getSizeEstimateBeforeLimit() {
+  return subtree_->getSizeEstimate() * 2;
+}
+float Describe::getMultiplicity([[maybe_unused]] size_t col) { return 1.0f; }
+bool Describe::knownEmptyResult() { return false; }
+
+// The result cannot easily be sorted, as it involves the recursive expansion
+// of graphs. Hence no column is reported as sorted (the returned vector is
+// empty).
+vector<ColumnIndex> Describe::resultSortedOn() const { return {}; }
+
+// The three result columns are bound to the variables `?subject`,
+// `?predicate`, and `?object` (in this order), and each of them is always
+// defined.
+//
+// NOTE: These variable names are hardcoded in the implicit CONSTRUCT query
+// created in `SparqlQleverVisitor::visitDescribe`.
+VariableToColumnMap Describe::computeVariableToColumnMap() const {
+  return {{Variable("?subject"), makeAlwaysDefinedColumn(0)},
+          {Variable("?predicate"), makeAlwaysDefinedColumn(1)},
+          {Variable("?object"), makeAlwaysDefinedColumn(2)}};
+}
+
+// Helper for the BFS over blank nodes: collect the blank nodes from `input`
+// that have not been encountered before. Every returned `Id` is inserted into
+// `alreadySeen`, so each blank node is returned at most once (also across
+// several calls with the same `alreadySeen`). The result is an `IdTable` with
+// a single column; `Id`s that are not blank nodes are ignored.
+static IdTable getNewBlankNodes(
+    const auto& allocator, ad_utility::HashSetWithMemoryLimit<Id>& alreadySeen,
+    std::span<Id> input) {
+  // Reserve space for the worst case (every input is a new blank node) and
+  // shrink to the actual number of new blank nodes at the end.
+  IdTable freshNodes{1, allocator};
+  freshNodes.resize(input.size());
+  decltype(auto) out = freshNodes.getColumn(0);
+  size_t numFresh = 0;
+  for (Id candidate : input) {
+    if (candidate.getDatatype() == Datatype::BlankNodeIndex) {
+      // The insertion only succeeds for blank nodes not seen before.
+      auto [position, inserted] = alreadySeen.emplace(candidate);
+      if (inserted) {
+        out[numFresh] = candidate;
+        ++numFresh;
+      }
+    }
+  }
+  freshNodes.resize(numFresh);
+  return freshNodes;
+}
+
+// _____________________________________________________________________________
+// Breadth-first expansion of blank nodes: add all triples whose subject is one
+// of the `blankNodes` (an `IdTable` with one column) to `finalResult`, then
+// continue with the blank nodes among the objects of these triples that are
+// not yet contained in `alreadySeen`, until no new blank nodes are found.
+// Local vocab entries of the added triples are merged into `localVocab`.
+//
+// NOTE: This is deliberately implemented as a loop. With a recursive
+// formulation, the intermediate join result of each level stays alive (and
+// one stack frame is consumed) until the deepest level has finished, which is
+// problematic for long chains of blank nodes (for example, RDF lists).
+void Describe::recursivelyAddBlankNodes(
+    IdTable& finalResult, LocalVocab& localVocab,
+    ad_utility::HashSetWithMemoryLimit<Id>& alreadySeen, IdTable blankNodes) {
+  AD_CORRECTNESS_CHECK(blankNodes.numColumns() == 1);
+
+  // Each iteration is one level of the BFS. We are done as soon as a level
+  // yields no more `blankNodes` to explore.
+  while (!blankNodes.empty()) {
+    // Expand the `blankNodes` by joining them with the full index and add the
+    // resulting triples to the `finalResult`.
+    //
+    // TODO<joka921> Make the result of DESCRIBE lazy, then we can avoid the
+    // additional copy here.
+    auto table =
+        makeAndExecuteJoinWithFullIndex(std::move(blankNodes), localVocab);
+    finalResult.insertAtEnd(table);
+
+    // The newly found blank nodes (objects of the new triples) are the start
+    // nodes of the next level. `alreadySeen` guarantees termination in the
+    // presence of cycles.
+    blankNodes = getNewBlankNodes(allocator(), alreadySeen, table.getColumn(2));
+  }
+}
+
+// _____________________________________________________________________________
+// Return all triples from the index whose subject is contained in `input` (an
+// `IdTable` with one column), as an `IdTable` with the three columns subject,
+// predicate, object. The scan of the index is restricted to the default graphs
+// of the DESCRIBE query. The local vocab of the join result is merged into
+// `localVocab`.
+IdTable Describe::makeAndExecuteJoinWithFullIndex(
+    IdTable input, LocalVocab& localVocab) const {
+  AD_CORRECTNESS_CHECK(input.numColumns() == 1);
+
+  // Create a `Join` operation that joins `input` (with column `?subject`) with
+  // the full index (with columns `?subject`, `?predicate`, `?object`) on the
+  // `?subject` column.
+  //
+  // NOTE(review): `ValuesForTesting` is included from the `test` directory
+  // (see the includes at the top of this file), so production code depends on
+  // a test helper here. Consider moving it to (or replacing it by an operation
+  // from) `src/engine`.
+  using V = Variable;
+  auto subjectVar = V{"?subject"};
+  auto valuesOp = ad_utility::makeExecutionTree<ValuesForTesting>(
+      getExecutionContext(), std::move(input),
+      std::vector<std::optional<Variable>>{subjectVar});
+  SparqlTripleSimple triple{subjectVar, V{"?predicate"}, V{"?object"}};
+  auto indexScan = ad_utility::makeExecutionTree<IndexScan>(
+      getExecutionContext(), Permutation::SPO, triple,
+      describe_.datasetClauses_.defaultGraphs_);
+  // The join columns have to be determined before the two trees are moved
+  // into the `Join`.
+  auto joinColValues = valuesOp->getVariableColumn(subjectVar);
+  auto joinColScan = indexScan->getVariableColumn(subjectVar);
+  auto join = ad_utility::makeExecutionTree<Join>(
+      getExecutionContext(), std::move(valuesOp), std::move(indexScan),
+      joinColValues, joinColScan);
+
+  // Compute the result of the `join` and select the columns `?subject`,
+  // `?predicate`, `?object`.
+  //
+  // NOTE: Typically, the join result has already those exact columns, in which
+  // case the `selectColumns` operation is a no-op. Not sure when this is not
+  // the case, but better safe than sorry.
+  auto result = join->getResult();
+  IdTable resultTable = result->idTable().clone();
+  ColumnIndex s = join->getVariableColumn(V{"?subject"});
+  ColumnIndex p = join->getVariableColumn(V{"?predicate"});
+  ColumnIndex o = join->getVariableColumn(V{"?object"});
+  resultTable.setColumnSubset(std::vector{s, p, o});
+
+  // The `indexScan` might have added some delta triples with local vocab IDs,
+  // so make sure to merge them into the `localVocab`.
+  localVocab.mergeWith(std::span{&result->localVocab(), 1});
+
+  return resultTable;
+}
+
+// _____________________________________________________________________________
+// Compute the set of `Id`s to describe: the `Id`s of all IRIs from the
+// DESCRIBE clause, plus all `Id`s bound to one of the variables from the
+// DESCRIBE clause in `result` (the result of the WHERE clause). The `Id`s are
+// returned without duplicates (in no particular order) as an `IdTable` with
+// one column. IRIs that have to be turned into local vocab entries are added
+// to `localVocab`.
+IdTable Describe::getIdsToDescribe(const Result& result,
+                                   LocalVocab& localVocab) const {
+  // First collect the `Id`s in a hash set, in order to remove duplicates.
+  ad_utility::HashSetWithMemoryLimit<Id> idsToDescribe{allocator()};
+  const auto& vocab = getIndex().getVocab();
+  for (const auto& resource : describe_.resources_) {
+    if (std::holds_alternative<TripleComponent::Iri>(resource)) {
+      // For an IRI, add the corresponding ID to `idsToDescribe`.
+      idsToDescribe.insert(
+          TripleComponent{std::get<TripleComponent::Iri>(resource)}.toValueId(
+              vocab, localVocab));
+    } else {
+      // For a variable, add all IDs that match the variable in the `result` of
+      // the WHERE clause to `idsToDescribe`. A variable that does not occur in
+      // the WHERE clause contributes nothing.
+      const auto& var = std::get<Variable>(resource);
+      auto column = subtree_->getVariableColumnOrNullopt(var);
+      if (!column.has_value()) {
+        continue;
+      }
+      for (Id id : result.idTable().getColumn(column.value())) {
+        // Skip rows in which the variable is unbound (for example, because it
+        // only occurs inside an OPTIONAL): only bound resources are described.
+        // NOTE(review): an undefined `Id` in the join column would presumably
+        // match every subject in the subsequent join with the full index --
+        // confirm against the semantics of `Join`.
+        if (id.getDatatype() == Datatype::Undefined) {
+          continue;
+        }
+        idsToDescribe.insert(id);
+      }
+    }
+  }
+
+  // Copy the `Id`s from the hash set to an `IdTable`.
+  IdTable idsAsTable{1, allocator()};
+  idsAsTable.resize(idsToDescribe.size());
+  std::ranges::copy(idsToDescribe, idsAsTable.getColumn(0).begin());
+  return idsAsTable;
+}
+
+// _____________________________________________________________________________
+ProtoResult Describe::computeResult([[maybe_unused]] bool requestLaziness) {
+  LocalVocab localVocab;
+
+  // Step 1: Evaluate the WHERE clause and determine the (unique) `Id`s of the
+  // resources to describe.
+  //
+  // TODO<joka921> Would we benefit from computing `resultOfWhereClause` lazily?
+  // Probably not, because we have to deduplicate the whole input anyway.
+  auto resultOfWhereClause = subtree_->getResult();
+  auto startIds = getIdsToDescribe(*resultOfWhereClause, localVocab);
+
+  // Step 2: Fetch all triples that have one of these `Id`s as their subject.
+  auto triples =
+      makeAndExecuteJoinWithFullIndex(std::move(startIds), localVocab);
+
+  // Step 3: Starting from the blank nodes among the objects of these triples,
+  // recursively add the triples of all reachable blank nodes.
+  ad_utility::HashSetWithMemoryLimit<Id> alreadySeen{allocator()};
+  auto firstBlankNodes =
+      getNewBlankNodes(allocator(), alreadySeen, triples.getColumn(2));
+  recursivelyAddBlankNodes(triples, localVocab, alreadySeen,
+                           std::move(firstBlankNodes));
+
+  return {std::move(triples), resultSortedOn(), std::move(localVocab)};
+}
diff --git a/src/engine/Describe.h b/src/engine/Describe.h
@@ -0,0 +1,78 @@
+// Copyright 2024, University of Freiburg,
+// Chair of Algorithms and Data Structures
+// Author: Johannes Kalmbach <[email protected]>
+
+#pragma once
+
+#include "engine/Operation.h"
+#include "parser/GraphPatternOperation.h"
+
+// Operation for DESCRIBE queries according to the Concise Bounded Description
+// (CBD) specification: https://www.w3.org/submissions/2005/SUBM-CBD-20050603 .
+//
+// NOTE: The current implementation recursively expands blank nodes. This can
+// be expanded to other reification schemes relatively easily (for example,
+// for Wikidata, also expand all object IRIs that start with the prefix
+// `http://www.wikidata.org/entity/statement/`).
+class Describe : public Operation {
+ private:
+  // The query execution tree for computing the WHERE clause of the DESCRIBE.
+  // Must be the neutral element if the DESCRIBE query has no WHERE clause.
+  // Must never be `nullptr` (this is checked in the constructor).
+  std::shared_ptr<QueryExecutionTree> subtree_;
+
+  // The specification of the DESCRIBE clause.
+  parsedQuery::Describe describe_;
+
+ public:
+  // Create a new DESCRIBE operation. The `subtree` computes the WHERE clause,
+  // and `describe` specifies the resources to describe.
+  Describe(QueryExecutionContext* qec,
+           std::shared_ptr<QueryExecutionTree> subtree,
+           parsedQuery::Describe describe);
+
+  // Getter for testing.
+  const auto& getDescribe() const { return describe_; }
+
+  // The following functions override those from the base class `Operation`.
+  // The result always has three columns (subject, predicate, object), and the
+  // only child is the `subtree_`.
+  std::vector<QueryExecutionTree*> getChildren() override;
+  string getCacheKeyImpl() const override;
+  string getDescriptor() const override;
+  size_t getResultWidth() const override;
+  size_t getCostEstimate() override;
+
+ private:
+  uint64_t getSizeEstimateBeforeLimit() override;
+
+ public:
+  float getMultiplicity(size_t col) override;
+  bool knownEmptyResult() override;
+
+ private:
+  [[nodiscard]] vector<ColumnIndex> resultSortedOn() const override;
+  ProtoResult computeResult(bool requestLaziness) override;
+  VariableToColumnMap computeVariableToColumnMap() const override;
+
+  // Add all triples where the subject is one of the `blankNodes` (an `IdTable`
+  // with one column) to the `finalResult`. Recursively continue for all newly
+  // found blank nodes (objects of the newly found triples, which are not
+  // contained in `alreadySeen`). This is a recursive implementation of
+  // breadth-first-search (BFS) where `blankNodes` is the set of start nodes,
+  // and `alreadySeen` is the set of nodes which have already been explored,
+  // which is needed to handle cycles in the graph.
+  void recursivelyAddBlankNodes(
+      IdTable& finalResult, LocalVocab& localVocab,
+      ad_utility::HashSetWithMemoryLimit<Id>& alreadySeen, IdTable blankNodes);
+
+  // Join the `input` (an `IdTable` with one column) with the full index on the
+  // subject column. The result has three columns: the subject, predicate, and
+  // object of each triple, where the subject is contained in `input`. This
+  // includes delta triples with local vocab IDs, which are added to the
+  // `localVocab`.
+  IdTable makeAndExecuteJoinWithFullIndex(IdTable input,
+                                          LocalVocab& localVocab) const;
+
+  // Get the set of (unique) IDs that match one of the variables or IRIs in
+  // the DESCRIBE clause and the `result` of the WHERE clause. For example, if
+  // the query is `DESCRIBE <x> ?y WHERE { ?y <p> <o>}`, return `<x>` and all
+  // IRIs that match `?y` in the WHERE clause, with all duplicates removed.
+  // The result is an `IdTable` with one column.
+  IdTable getIdsToDescribe(const Result& result, LocalVocab& localVocab) const;
+};
diff --git a/src/engine/Operation.h b/src/engine/Operation.h
@@ -1,8 +1,7 @@
-// Copyright 2015, University of Freiburg,
-// Chair of Algorithms and Data Structures.
-// Author:
-//   2015-2017 Björn Buchhold ([email protected])
-//   2018-     Johannes Kalmbach ([email protected])
+// Copyright 2015 - 2024, University of Freiburg
+// Chair of Algorithms and Data Structures
+// Authors: Björn Buchhold <[email protected]>    [2015 - 2017]
+//          Johannes Kalmbach <[email protected]> [2018 - 2024]
 
 #pragma once
 

diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp
@@ -19,6 +19,7 @@
 #include "engine/CheckUsePatternTrick.h"
 #include "engine/CountAvailablePredicates.h"
 #include "engine/CountConnectedSubgraphs.h"
+#include "engine/Describe.h"
 #include "engine/Distinct.h"
 #include "engine/Filter.h"
 #include "engine/GroupBy.h"
@@ -2349,6 +2350,8 @@ void QueryPlanner::GraphPatternPlanner::graphPatternOperationVisitor(Arg& arg) {
     visitGroupOptionalOrMinus(std::move(candidates));
   } else if constexpr (std::is_same_v<T, p::PathQuery>) {
     visitPathSearch(arg);
+  } else if constexpr (std::is_same_v<T, p::Describe>) {
+    visitDescribe(arg);
   } else if constexpr (std::is_same_v<T, p::SpatialQuery>) {
     visitSpatialSearch(arg);
   } else {
@@ -2587,3 +2590,14 @@ void QueryPlanner::GraphPatternPlanner::optimizeCommutatively() {
   candidatePlans_.push_back(std::move(lastRow));
   planner_.checkCancellation();
 }
+
+// _______________________________________________________________
+void QueryPlanner::GraphPatternPlanner::visitDescribe(
+    parsedQuery::Describe& describe) {
+  // Plan the WHERE clause of the DESCRIBE as a query of its own.
+  // NOTE(review): the second argument (`true`) presumably marks the WHERE
+  // clause as a subquery -- confirm against `createExecutionTree`.
+  auto whereClauseTree = std::make_shared<QueryExecutionTree>(
+      planner_.createExecutionTree(describe.whereClause_.get(), true));
+
+  // Wrap the plan for the WHERE clause into a `Describe` operation and add it
+  // as a row with this single candidate plan.
+  auto describePlan = makeSubtreePlan<Describe>(
+      planner_._qec, std::move(whereClauseTree), describe);
+  candidatePlans_.push_back(std::vector{std::move(describePlan)});
+  planner_.checkCancellation();
+}