Skip to content

Commit

Permalink
Own type TripleComponent::Literal instead of std::string (#913)
Browse files Browse the repository at this point in the history
So far, our class `TripleComponent` stored a parsed literal as a `std::string`. The class already had its own types for `Variable`, `UNDEF`, and double or integer values. Now it also has its own type `Literal` (which is implemented using a new class `NormalizedRDFString`). What remains is to also add dedicated types for IRIs and blank nodes.
  • Loading branch information
joka921 authored Mar 15, 2023
1 parent 7539be6 commit bf5c70a
Show file tree
Hide file tree
Showing 26 changed files with 558 additions and 320 deletions.
63 changes: 34 additions & 29 deletions src/engine/QueryPlanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1160,9 +1160,9 @@ QueryPlanner::SubtreePlan QueryPlanner::getTextLeafPlan(
SubtreePlan plan(_qec);
plan._idsOfIncludedNodes |= (size_t(1) << node._id);
auto& tree = *plan._qet;
AD_CONTRACT_CHECK(node._wordPart.size() > 0);
AD_CONTRACT_CHECK(node._wordPart.has_value());
auto textOp = std::make_shared<TextOperationWithoutFilter>(
_qec, node._wordPart, node._variables, node._cvar.value());
_qec, node._wordPart.value(), node._variables, node._cvar.value());
tree.setOperation(QueryExecutionTree::OperationType::TEXT_WITHOUT_FILTER,
textOp);
return plan;
Expand Down Expand Up @@ -1284,7 +1284,8 @@ string QueryPlanner::TripleGraph::asString() const {
} else {
os << i << " {TextOP for "
<< _nodeMap.find(i)->second->_cvar.value().name() << ", wordPart: \""
<< _nodeMap.find(i)->second->_wordPart << "\"} : (";
<< absl::StrJoin(_nodeMap.find(i)->second->_wordPart.value(), " ")
<< "\"} : (";
}

for (size_t j = 0; j < _adjLists[i].size(); ++j) {
Expand Down Expand Up @@ -1487,12 +1488,10 @@ vector<vector<QueryPlanner::SubtreePlan>> QueryPlanner::fillDpTab(

// _____________________________________________________________________________
bool QueryPlanner::TripleGraph::isTextNode(size_t i) const {
return _nodeMap.count(i) > 0 && (_nodeMap.find(i)->second->_triple._p._iri ==
CONTAINS_ENTITY_PREDICATE ||
_nodeMap.find(i)->second->_triple._p._iri ==
CONTAINS_WORD_PREDICATE ||
_nodeMap.find(i)->second->_triple._p._iri ==
INTERNAL_TEXT_MATCH_PREDICATE);
return _nodeMap.count(i) > 0 &&
(_nodeMap.find(i)->second->_triple._p._iri ==
CONTAINS_ENTITY_PREDICATE ||
_nodeMap.find(i)->second->_triple._p._iri == CONTAINS_WORD_PREDICATE);
}

// _____________________________________________________________________________
Expand Down Expand Up @@ -1697,12 +1696,23 @@ QueryPlanner::TripleGraph& QueryPlanner::TripleGraph::operator=(
QueryPlanner::TripleGraph::TripleGraph()
: _adjLists(), _nodeMap(), _nodeStorage() {}

// ___________________________________________________________________________
namespace {

// Return the content of the quoted literal `lit` without its enclosing double
// quotes, converted to lower case via `ad_utility::getLowercaseUtf8`. The
// input must be at least two characters long and must start and end with `"`;
// this is enforced by an `AD_CORRECTNESS_CHECK`. This helper is only used in
// the `collapseTextCliques` function.
string stripAndLowercaseLiteral(std::string_view lit) {
  AD_CORRECTNESS_CHECK(lit.size() >= 2 && lit.starts_with('"') &&
                       lit.ends_with('"'));
  // The check above guarantees `lit.size() >= 2`, so the length passed to
  // `substr` cannot underflow.
  const std::string_view unquoted = lit.substr(1, lit.size() - 2);
  return ad_utility::getLowercaseUtf8(unquoted);
}
} // namespace

// _____________________________________________________________________________
void QueryPlanner::TripleGraph::collapseTextCliques() {
// TODO: Could use more refactoring.
// In Earlier versions there were no ql:contains... predicates but
// a symmetric <in-text> predicate. Therefore some parts are still more
// complex than need be.

// Create a map from context var to triples it occurs in (the cliques).
ad_utility::HashMap<Variable, vector<size_t>> cvarsToTextNodes(
Expand All @@ -1718,32 +1728,27 @@ void QueryPlanner::TripleGraph::collapseTextCliques() {
vector<std::set<size_t>> tnAdjSetsToOldIds;
for (auto& cvarsToTextNode : cvarsToTextNodes) {
auto& cvar = cvarsToTextNode.first;
string wordPart;
std::vector<string> words;
vector<SparqlTriple> trips;
tnAdjSetsToOldIds.push_back(std::set<size_t>());
tnAdjSetsToOldIds.emplace_back();
auto& adjNodes = tnAdjSetsToOldIds.back();
for (auto nid : cvarsToTextNode.second) {
removedNodeIds[nid] = id;
adjNodes.insert(_adjLists[nid].begin(), _adjLists[nid].end());
auto& triple = _nodeMap[nid]->_triple;
trips.push_back(triple);
if (triple._s == cvar && triple._o.isString() &&
!triple._o.isVariable()) {
if (!wordPart.empty()) {
wordPart += " ";
}
wordPart += triple._o.getString();
}
// TODO<joka921> Figure out what is going on here... The subject and the
// object of a triple are being combined into a string as it seems.
if (triple._o == cvar && !isVariable(triple._s)) {
if (!wordPart.empty()) {
wordPart += " ";
}
wordPart += triple._s.toString();
// TODO<joka921> I think the check "is the predicate ql:contains_word" is
// missing. Verify this.
if (triple._s == cvar && triple._o.isLiteral()) {
std::vector<std::string> newWords = absl::StrSplit(
stripAndLowercaseLiteral(
triple._o.getLiteral().normalizedLiteralContent().get()),
' ');
words.insert(words.end(), newWords.begin(), newWords.end());
}
}
textNodes.emplace_back(Node(id++, cvar, wordPart, trips));
textNodes.emplace_back(id, cvar, std::move(words), trips);
++id;
assert(tnAdjSetsToOldIds.size() == id);
}

Expand Down
39 changes: 19 additions & 20 deletions src/engine/QueryPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <set>
#include <vector>

#include "absl/strings/str_join.h"
#include "absl/strings/str_split.h"
#include "engine/CheckUsePatternTrick.h"
#include "engine/Filter.h"
#include "engine/QueryExecutionTree.h"
Expand All @@ -34,34 +36,30 @@ class QueryPlanner {
TripleGraph(const TripleGraph& other, vector<size_t> keepNodes);

struct Node {
Node(size_t id, const SparqlTriple& t)
// TODO<joka921> should the `_cvar` be an `optional` or a `variant`.
: _id(id),
_triple(t),
_variables(),
_cvar(std::nullopt),
_wordPart() {
if (isVariable(t._s)) {
_variables.insert(t._s.getVariable());
Node(size_t id, SparqlTriple t) : _id(id), _triple(std::move(t)) {
if (isVariable(_triple._s)) {
_variables.insert(_triple._s.getVariable());
}
if (isVariable(t._p)) {
_variables.insert(Variable{t._p._iri});
if (isVariable(_triple._p)) {
_variables.insert(Variable{_triple._p._iri});
}
if (isVariable(t._o)) {
_variables.insert(t._o.getVariable());
if (isVariable(_triple._o)) {
_variables.insert(_triple._o.getVariable());
}
}

Node(size_t id, const Variable& cvar, const string& wordPart,
Node(size_t id, const Variable& cvar, std::vector<std::string> words,
const vector<SparqlTriple>& trips)
: _id(id),
// TODO<joka921> What is this triple used for? If it is just a
// dummy, then we can replace it by a `variant<Triple,
// TextNodeData>`.
_triple(cvar,
PropertyPath(PropertyPath::Operation::IRI, 0,
INTERNAL_TEXT_MATCH_PREDICATE, {}),
wordPart),
_variables(),
TripleComponent::UNDEF{}),
_cvar(cvar),
_wordPart(wordPart) {
_wordPart(std::move(words)) {
_variables.insert(cvar);
for (const auto& t : trips) {
if (isVariable(t._s)) {
Expand Down Expand Up @@ -97,7 +95,7 @@ class QueryPlanner {
// together?
if (n._cvar.has_value()) {
out << " cvar " << n._cvar.value().name() << " wordPart "
<< n._wordPart;
<< absl::StrJoin(n._wordPart.value(), " ");
}
return out;
}
Expand All @@ -106,11 +104,12 @@ class QueryPlanner {
SparqlTriple _triple;
ad_utility::HashSet<Variable> _variables;
std::optional<Variable> _cvar = std::nullopt;
string _wordPart;
std::optional<std::vector<std::string>> _wordPart = std::nullopt;
};

// Allows for manually building triple graphs for testing
TripleGraph(const std::vector<std::pair<Node, std::vector<size_t>>>& init);
explicit TripleGraph(
const std::vector<std::pair<Node, std::vector<size_t>>>& init);

// Checks for id and order independent equality
bool isSimilar(const TripleGraph& other) const;
Expand Down
10 changes: 5 additions & 5 deletions src/engine/TextOperationWithoutFilter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@ size_t TextOperationWithoutFilter::getResultWidth() const {

// _____________________________________________________________________________
TextOperationWithoutFilter::TextOperationWithoutFilter(
QueryExecutionContext* qec, const string& words,
const SetOfVariables& variables, const Variable& cvar, size_t textLimit)
QueryExecutionContext* qec, const std::vector<std::string>& words,
SetOfVariables variables, Variable cvar, size_t textLimit)
: Operation(qec),
_words(words),
_variables(variables),
_cvar(cvar),
_words(absl::StrJoin(words, " ")),
_variables(std::move(variables)),
_cvar(std::move(cvar)),
_textLimit(textLimit),
_sizeEstimate(std::numeric_limits<size_t>::max()) {}

Expand Down
9 changes: 5 additions & 4 deletions src/engine/TextOperationWithoutFilter.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class TextOperationWithoutFilter : public Operation {
using SetOfVariables = ad_utility::HashSet<Variable>;

private:
const string _words;
string _words;
const SetOfVariables _variables;
const Variable _cvar;

Expand All @@ -28,9 +28,10 @@ class TextOperationWithoutFilter : public Operation {
vector<float> _multiplicities;

public:
TextOperationWithoutFilter(QueryExecutionContext* qec, const string& words,
const SetOfVariables& variables,
const Variable& cvar, size_t textLimit = 1);
TextOperationWithoutFilter(QueryExecutionContext* qec,
const std::vector<std::string>& words,
SetOfVariables variables, Variable cvar,
size_t textLimit = 1);

protected:
virtual string asStringImpl(size_t indent = 0) const override;
Expand Down
30 changes: 18 additions & 12 deletions src/engine/sparqlExpressions/LiteralExpression.h
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
// Copyright 2021, University of Freiburg, Chair of Algorithms and Data
// Structures. Author: Johannes Kalmbach <[email protected]>

//
// Created by johannes on 29.09.21.
//
// Copyright 2021, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach <[email protected]>

#ifndef QLEVER_LITERALEXPRESSION_H
#define QLEVER_LITERALEXPRESSION_H
Expand All @@ -20,23 +17,28 @@ template <typename T>
class LiteralExpression : public SparqlExpression {
public:
// _________________________________________________________________________
LiteralExpression(T _value) : _value{std::move(_value)} {}
explicit LiteralExpression(T _value) : _value{std::move(_value)} {}

// A simple getter for the stored value.
const T& value() const { return _value; }

// Evaluating just returns the constant/literal value.
ExpressionResult evaluate(
[[maybe_unused]] EvaluationContext* context) const override {
if constexpr (std::is_same_v<string, T>) {
// Common code for the `Literal` and `std::string` case.
auto getIdOrString = [&context](const std::string& s) -> ExpressionResult {
Id id;
bool idWasFound = context->_qec.getIndex().getId(_value, &id);
bool idWasFound = context->_qec.getIndex().getId(s, &id);
if (!idWasFound) {
// no vocabulary entry found, just use it as a string constant.
// TODO<joka921>:: emit a warning.
return _value;
return s;
}
return id;
};
if constexpr (std::is_same_v<TripleComponent::Literal, T>) {
return getIdOrString(_value.rawContent());
} else if constexpr (std::is_same_v<string, T>) {
return getIdOrString(_value);
} else if constexpr (std::is_same_v<Variable, T>) {
// If a variable is grouped, then we know that it always has the same
// value and can treat it as a constant. This is not possible however when
Expand Down Expand Up @@ -91,6 +93,8 @@ class LiteralExpression : public SparqlExpression {
return _value;
} else if constexpr (std::is_same_v<T, ValueId>) {
return absl::StrCat("#valueId ", _value.getBits(), "#");
} else if constexpr (std::is_same_v<T, TripleComponent::Literal>) {
return absl::StrCat("#literal: ", _value.rawContent());
} else {
return {std::to_string(_value)};
}
Expand Down Expand Up @@ -120,7 +124,9 @@ using BoolExpression = detail::LiteralExpression<bool>;
using IntExpression = detail::LiteralExpression<int64_t>;
using DoubleExpression = detail::LiteralExpression<double>;
using VariableExpression = detail::LiteralExpression<::Variable>;
using StringOrIriExpression = detail::LiteralExpression<string>;
using IriExpression = detail::LiteralExpression<string>;
using StringLiteralExpression =
detail::LiteralExpression<TripleComponent::Literal>;
using IdExpression = detail::LiteralExpression<ValueId>;
} // namespace sparqlExpression

Expand Down
34 changes: 20 additions & 14 deletions src/engine/sparqlExpressions/RegexExpression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,16 +67,8 @@ std::optional<std::string> getPrefixRegex(std::string regex) {
// Assert that `input` starts and ends with double quotes `"` and remove those
// quotes.
std::string removeQuotes(std::string_view input) {
AD_CONTRACT_CHECK(input.size() >= 2);
// Currently, IRIs are also passed as strings, but are not allowed here.
if (input.starts_with('<')) {
AD_CONTRACT_CHECK(input.ends_with('>'));
throw std::runtime_error(
"An IRI was passed as the second or third argument to the REGEX "
"function, but only string literals are allowed.");
}
AD_CONTRACT_CHECK(input.starts_with('"'));
AD_CONTRACT_CHECK(input.ends_with('"'));
AD_CORRECTNESS_CHECK(input.size() >= 2 && input.starts_with('"') &&
input.ends_with('"'));
input.remove_prefix(1);
input.remove_suffix(1);
return std::string{input};
Expand All @@ -95,18 +87,32 @@ RegexExpression::RegexExpression(
}
std::string regexString;
std::string originalRegexString;
if (auto regexPtr = dynamic_cast<const StringOrIriExpression*>(regex.get())) {
originalRegexString = regexPtr->value();
if (auto regexPtr =
dynamic_cast<const StringLiteralExpression*>(regex.get())) {
originalRegexString = regexPtr->value().normalizedLiteralContent().get();
if (!regexPtr->value().datatypeOrLangtag().empty()) {
throw std::runtime_error(
"The second argument to the REGEX function (which contains the "
"regular expression) must not contain a language tag or a datatype");
}
regexString = detail::removeQuotes(originalRegexString);
} else {
throw std::runtime_error(
"The second argument to the REGEX function must be a "
"string literal (which contains the regular expression)");
}
if (optionalFlags.has_value()) {
if (auto flagsPtr = dynamic_cast<const StringOrIriExpression*>(
if (auto flagsPtr = dynamic_cast<const StringLiteralExpression*>(
optionalFlags.value().get())) {
auto flags = detail::removeQuotes(flagsPtr->value());
std::string_view originalFlags =
flagsPtr->value().normalizedLiteralContent().get();
if (!flagsPtr->value().datatypeOrLangtag().empty()) {
throw std::runtime_error(
"The third argument to the REGEX function (which contains optional "
"flags to configure the evaluation) must not contain a language "
"tag or a datatype");
}
auto flags = detail::removeQuotes(originalFlags);
auto firstInvalidFlag = flags.find_first_not_of("imsu");
if (firstInvalidFlag != std::string::npos) {
throw std::runtime_error{absl::StrCat(
Expand Down
9 changes: 7 additions & 2 deletions src/engine/sparqlExpressions/RelationalExpressions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,13 +398,18 @@ RelationalExpression<Comp>::getLanguageFilterExpression() const {
[](const auto& left, const auto& right) -> std::optional<LangFilterData> {
const auto* varPtr = dynamic_cast<const LangExpression*>(left.get());
const auto* langPtr =
dynamic_cast<const StringOrIriExpression*>(right.get());
dynamic_cast<const StringLiteralExpression*>(right.get());

if (!varPtr || !langPtr) {
return std::nullopt;
}

return LangFilterData{varPtr->variable(), langPtr->value()};
// TODO<joka921> Check that the language string doesn't contain a datatype
// etc.
// TODO<joka921> Is this even allowed by the grammar?
return LangFilterData{
varPtr->variable(),
std::string{langPtr->value().normalizedLiteralContent().get()}};
};

const auto& child1 = children_[0];
Expand Down
2 changes: 1 addition & 1 deletion src/global/TypedIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
#ifndef QLEVER_TYPEDINDEX_H
#define QLEVER_TYPEDINDEX_H

#include "../util/ConstexprSmallString.h"
#include "util/ConstexprSmallString.h"

namespace ad_utility {
using IndexTag = ConstexprSmallString<30>;
Expand Down
3 changes: 2 additions & 1 deletion src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -932,7 +932,8 @@ LangtagAndTriple IndexImpl::tripleToInternalRepresentation(
if (idIfNotString.has_value()) {
resultTriple[2] = idIfNotString.value();
} else {
resultTriple[2] = std::move(triple._object.getString());
// `toRdfLiteral` handles literals as well as IRIs correctly.
resultTriple[2] = std::move(triple._object).toRdfLiteral();
}

for (size_t i = 0; i < 3; ++i) {
Expand Down
Loading

0 comments on commit bf5c70a

Please sign in to comment.