Skip to content

Commit

Permalink
Support all expressions in HAVING and ORDER BY (#914)
Browse files Browse the repository at this point in the history
So far, certain expressions in HAVING or ORDER BY were not supported. For example, it was not possible to use a variable defined as the target of an alias (in the SELECT clause) in a HAVING or ORDER BY expression. In combination with GROUP BY, it was not possible to use aggregates of the non-grouped variables. Now all valid expressions are supported.

On the side, introduce a macro `AD_EXPECT_THROW_WITH_MESSAGE` that allows unit tests that check not only that an error occured, but also that it occurred with the right error message.
  • Loading branch information
joka921 authored Mar 16, 2023
1 parent bf5c70a commit c9b1958
Show file tree
Hide file tree
Showing 27 changed files with 775 additions and 319 deletions.
5 changes: 5 additions & 0 deletions e2e/queryit.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ class ErrorReportingHandler(urllib.request.BaseHandler):
def http_error_default(self, req, fp, code, msg, hdrs):
if code != 200:
eprint(f"Status {code} for {req.full_url}")
eprint(f"Message {msg} headers {hdrs}")
eprint(fp.read())
return fp


Expand Down Expand Up @@ -228,6 +230,9 @@ def test_check(check_dict: Dict[str, Any], result: Dict[str, Any]) -> bool:
if not found:
eprint("contains_row check failed:\n" +
"\tdid not find %r" % gold_row)
print("Printing the first 10 rows of the result:")
for row in res[:10]:
print(row)
return False
elif check == 'contains_warning':
for requested_warning in value:
Expand Down
41 changes: 41 additions & 0 deletions e2e/scientists_queries.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1425,4 +1425,45 @@ queries:
- contains_row: [ 3.0, 2.5]
- contains_row: [ 4.0, 2.2]

- query: aggregate-in-having-clause
type: no-text
sparql: |
SELECT ?x WHERE {
VALUES (?x ?y) { (0.5 1.5) (0.5 3.5) (1.2 2.8)}
} GROUP BY ?x
HAVING (?x + COUNT(?y) > 2.3)
checks:
- num_cols: 1
- num_rows : 1
- selected: [ "?x" ]
- contains_row: [ 0.5 ]

- query: aggregate-in-order-by
type: no-text
sparql: |
SELECT (AVG(?x + ?y) as ?avg) WHERE {
VALUES (?x ?y) { (0.5 1.5) (0.5 3.5) (1.2 2.8)}
} GROUP BY ?x
ORDER BY (- SUM(?y))
checks:
- num_cols: 1
- num_rows : 2
- selected: [ "?avg" ]
- contains_row: [ 3.0 ]
- contains_row: [ 4.0 ]

- query: variable-from-select-clause-reusage
type: no-text
sparql: |
SELECT (AVG(?x + ?y) as ?avg) (?avg + COUNT(?y) * 0.5 AS ?b) WHERE {
VALUES (?x ?y) { (0.5 1.5) (0.5 3.5) (1.2 2.8)}
} GROUP BY ?x
checks:
- num_cols: 2
- num_rows : 2
- selected: [ "?avg", "?b" ]
- contains_row: [ 3.0, 4.0 ]
- contains_row: [ 4.0, 4.5 ]



38 changes: 32 additions & 6 deletions src/engine/GroupBy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ GroupBy::GroupBy(QueryExecutionContext* qec, vector<Variable> groupByVariables,
: Operation{qec},
_groupByVariables{std::move(groupByVariables)},
_aliases{std::move(aliases)} {
// sort the aliases and groupByVariables to ensure the cache key is order
// Sort the groupByVariables to ensure that the cache key is order
// invariant.
std::ranges::sort(_aliases, std::less<>{},
[](const Alias& a) { return a._target.name(); });

// Note: It is tempting to do the same also for the aliases, but that would
// break the case when an alias reuses a variable that was bound by a previous
// alias.
std::ranges::sort(_groupByVariables, std::less<>{}, &Variable::name);

auto sortColumns = computeSortColumns(subtree.get());
Expand All @@ -39,8 +39,22 @@ GroupBy::GroupBy(QueryExecutionContext* qec, vector<Variable> groupByVariables,
}

string GroupBy::asStringImpl(size_t indent) const {
const auto varMap = getInternallyVisibleVariableColumns();
const auto varMapInput = _subtree->getVariableColumns();
const auto& varMap = getInternallyVisibleVariableColumns();
auto varMapInput = _subtree->getVariableColumns();

// We also have to encode the variables to which alias results are stored in
// the cache key of the expressions in case they reuse a variable from the
// previous result.
auto numColumnsInput = _subtree->getResultWidth();
for (const auto& [var, column] : varMap) {
if (!varMapInput.contains(var)) {
// It is important that the cache keys for the variables from the aliases
// do not collide with the query body, and that they are consistent. The
// constant `1000` has no deeper meaning but makes debugging easier.
varMapInput[var] = column + 1000 + numColumnsInput;
}
}

std::ostringstream os;
for (size_t i = 0; i < indent; ++i) {
os << " ";
Expand Down Expand Up @@ -166,6 +180,11 @@ void GroupBy::processGroup(

auto& resultEntry = result->operator()(resultRow, resultColumn);

// Copy the result to the evaluation context in case one of the following
// aliases has to reuse it.
evaluationContext._previousResultsFromSameGroup.at(resultColumn) =
sparqlExpression::copyExpressionResultIfNotVector(expressionResult);

auto visitor = [&]<sparqlExpression::SingleExpressionResult T>(
T&& singleResult) mutable {
constexpr static bool isStrongId = std::is_same_v<T, Id>;
Expand Down Expand Up @@ -227,8 +246,15 @@ void GroupBy::doGroupBy(const IdTable& dynInput,
*getExecutionContext(), columnMap, inTable->_idTable,
getExecutionContext()->getAllocator(), outTable->localVocabNonConst());

// In a GROUP BY evaluation, the expressions need to know which variables are
// grouped, and to which columns the results of the aliases are written. The
// latter information is needed if the expression of an reuses the result
// variable from a previous alias as an input.
evaluationContext._groupedVariables = ad_utility::HashSet<Variable>{
_groupByVariables.begin(), _groupByVariables.end()};
evaluationContext._variableToColumnMapPreviousResults =
getInternallyVisibleVariableColumns();
evaluationContext._previousResultsFromSameGroup.resize(getResultWidth());

auto processNextBlock = [&](size_t blockStart, size_t blockEnd) {
result.emplace_back();
Expand Down
2 changes: 1 addition & 1 deletion src/engine/sparqlExpressions/AggregateExpression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ AggregateExpression<AggregateOperation, FinalOperation>::children() {

// _________________________________________________________________________
template <typename AggregateOperation, typename FinalOperation>
vector<std::string> AggregateExpression<
vector<Variable> AggregateExpression<
AggregateOperation, FinalOperation>::getUnaggregatedVariables() {
// This is an aggregate, so it never leaves any unaggregated variables.
return {};
Expand Down
2 changes: 1 addition & 1 deletion src/engine/sparqlExpressions/AggregateExpression.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class AggregateExpression : public SparqlExpression {
std::span<SparqlExpression::Ptr> children() override;

// _________________________________________________________________________
vector<std::string> getUnaggregatedVariables() override;
vector<Variable> getUnaggregatedVariables() override;

// An `AggregateExpression` (obviously) contains an aggregate.
bool containsAggregate() const override { return true; }
Expand Down
2 changes: 1 addition & 1 deletion src/engine/sparqlExpressions/GroupConcatExpression.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class GroupConcatExpression : public SparqlExpression {

// A `GroupConcatExpression` is an aggregate, so it never leaves any
// unaggregated variables.
vector<std::string> getUnaggregatedVariables() override { return {}; }
vector<Variable> getUnaggregatedVariables() override { return {}; }

// A `GroupConcatExpression` is an aggregate.
bool containsAggregate() const override { return true; }
Expand Down
67 changes: 45 additions & 22 deletions src/engine/sparqlExpressions/LiteralExpression.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach <[email protected]>

#ifndef QLEVER_LITERALEXPRESSION_H
#define QLEVER_LITERALEXPRESSION_H
#pragma once

#include "engine/sparqlExpressions/SparqlExpression.h"

Expand Down Expand Up @@ -40,22 +39,7 @@ class LiteralExpression : public SparqlExpression {
} else if constexpr (std::is_same_v<string, T>) {
return getIdOrString(_value);
} else if constexpr (std::is_same_v<Variable, T>) {
// If a variable is grouped, then we know that it always has the same
// value and can treat it as a constant. This is not possible however when
// we are inside an aggregate, because for example `SUM(?variable)` must
// still compute the sum over the whole group.
if (context->_groupedVariables.contains(_value) && !isInsideAggregate()) {
auto column = context->getColumnIndexForVariable(_value);
const auto& table = context->_inputTable;
auto constantValue = table.at(context->_beginIndex, column);
assert((std::ranges::all_of(
table.begin() + context->_beginIndex,
table.begin() + context->_endIndex,
[&](const auto& row) { return row[column] == constantValue; })));
return constantValue;
} else {
return _value;
}
return evaluateIfVariable(context, _value);
} else {
return _value;
}
Expand All @@ -74,9 +58,9 @@ class LiteralExpression : public SparqlExpression {
}

// _________________________________________________________________________
vector<std::string> getUnaggregatedVariables() override {
vector<Variable> getUnaggregatedVariables() override {
if constexpr (std::is_same_v<T, ::Variable>) {
return {_value.name()};
return {_value};
} else {
return {};
}
Expand Down Expand Up @@ -116,6 +100,47 @@ class LiteralExpression : public SparqlExpression {

private:
T _value;

// Evaluate the expression if it is a variable expression with the given
// `variable`. The variable is passed in explicitly because this function
// might be called recursively.
ExpressionResult evaluateIfVariable(EvaluationContext* context,
const Variable& variable) const {
// If this is a variable that is not visible in the input but was bound by a
// previous alias in the same SELECT clause, then read the constant value of
// the variable from the data structures dedicated to this case.
if (auto resultFromSameRow =
context->getResultFromPreviousAggregate(variable);
resultFromSameRow.has_value() &&
!context->_groupedVariables.contains(variable)) {
// If the expression is a simple renaming of a variable `(?x AS ?y)` we
// have to recurse to track a possible chain of such renamings in the
// SELECT clause.
if (const Variable* var =
std::get_if<Variable>(&resultFromSameRow.value());
var != nullptr) {
return evaluateIfVariable(context, *var);
} else {
return std::move(resultFromSameRow.value());
}
}
// If a variable is grouped, then we know that it always has the same
// value and can treat it as a constant. This is not possible however when
// we are inside an aggregate, because for example `SUM(?variable)` must
// still compute the sum over the whole group.
if (context->_groupedVariables.contains(variable) && !isInsideAggregate()) {
auto column = context->getColumnIndexForVariable(variable);
const auto& table = context->_inputTable;
auto constantValue = table.at(context->_beginIndex, column);
assert((std::ranges::all_of(
table.begin() + context->_beginIndex,
table.begin() + context->_endIndex,
[&](const auto& row) { return row[column] == constantValue; })));
return constantValue;
} else {
return variable;
}
}
};
} // namespace detail

Expand All @@ -129,5 +154,3 @@ using StringLiteralExpression =
detail::LiteralExpression<TripleComponent::Literal>;
using IdExpression = detail::LiteralExpression<ValueId>;
} // namespace sparqlExpression

#endif // QLEVER_LITERALEXPRESSION_H
9 changes: 3 additions & 6 deletions src/engine/sparqlExpressions/SampleExpression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,9 @@ ExpressionResult SampleExpression::evaluate(EvaluationContext* context) const {
} else if constexpr (std::is_same_v<T, ::Variable>) {
// TODO<joka921> Can't this be a simpler function (getIdAt)
AD_CONTRACT_CHECK(context->_endIndex > context->_beginIndex);
EvaluationContext contextForSingleValue = *context;
contextForSingleValue._endIndex = contextForSingleValue._beginIndex + 1;
auto idOfFirstAsVector =
detail::getIdsFromVariable(childResult, &contextForSingleValue);
return idOfFirstAsVector[0];
;
std::span<const ValueId> idOfFirstAsVector = detail::getIdsFromVariable(
childResult, context, context->_beginIndex, context->_endIndex);
return ExpressionResult{idOfFirstAsVector[0]};
} else {
static_assert(isConstantResult<T>);
return childResult;
Expand Down
6 changes: 4 additions & 2 deletions src/engine/sparqlExpressions/SampleExpression.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@ namespace sparqlExpression {
class SampleExpression : public SparqlExpression {
public:
SampleExpression([[maybe_unused]] bool distinct, Ptr&& child)
: _child{std::move(child)} {}
: _child{std::move(child)} {
setIsInsideAggregate();
}

// __________________________________________________________________________
ExpressionResult evaluate(EvaluationContext* context) const override;

// _____________________________________________________________________
vector<std::string> getUnaggregatedVariables() override {
vector<Variable> getUnaggregatedVariables() override {
// This is an aggregation, so it never leaves any unaggregated variables.
return {};
}
Expand Down
4 changes: 2 additions & 2 deletions src/engine/sparqlExpressions/SparqlExpression.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@ class SparqlExpression {

/// Return all the variables that occur in the expression, but are not
/// aggregated.
virtual vector<std::string> getUnaggregatedVariables() {
virtual vector<Variable> getUnaggregatedVariables() {
// Default implementation: This expression adds no variables, but all
// unaggregated variables from the children remain unaggregated.
std::vector<string> result;
std::vector<Variable> result;
for (const auto& child : children()) {
auto childResult = child->getUnaggregatedVariables();
result.insert(result.end(), std::make_move_iterator(childResult.begin()),
Expand Down
58 changes: 26 additions & 32 deletions src/engine/sparqlExpressions/SparqlExpressionGenerators.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,12 @@

namespace sparqlExpression::detail {

// Internal implementation of `getIdsFromVariable` (see below).
// It is required because of the `CALL_FIXED_SIZE` mechanism for the `IdTable`s.
template <size_t WIDTH>
void getIdsFromVariableImpl(VectorWithMemoryLimit<ValueId>& result,
const ::Variable& variable,
EvaluationContext* context) {
AD_CONTRACT_CHECK(result.empty());
const auto inputTable = context->_inputTable.asStaticView<WIDTH>();

const size_t beginIndex = context->_beginIndex;
const size_t endIndex = context->_endIndex;
/// Convert a variable to a vector of all the Ids it is bound to in the
/// `context`.
inline std::span<const ValueId> getIdsFromVariable(
const ::Variable& variable, const EvaluationContext* context,
size_t beginIndex, size_t endIndex) {
const auto& inputTable = context->_inputTable;

if (!context->_variableToColumnAndResultTypeMap.contains(variable)) {
throw std::runtime_error(
Expand All @@ -34,22 +29,20 @@ void getIdsFromVariableImpl(VectorWithMemoryLimit<ValueId>& result,
const size_t columnIndex =
context->_variableToColumnAndResultTypeMap.at(variable).first;

result.reserve(endIndex - beginIndex);
for (size_t i = beginIndex; i < endIndex; ++i) {
result.push_back(ValueId{inputTable(i, columnIndex)});
}
std::span<const ValueId> completeColumn = inputTable.getColumn(columnIndex);

AD_CONTRACT_CHECK(beginIndex <= endIndex &&
endIndex <= completeColumn.size());
return {completeColumn.begin() + beginIndex,
completeColumn.begin() + endIndex};
}

/// Convert a variable to a vector of all the Ids it is bound to in the
/// `context`.
// TODO<joka921> Restructure QLever to column based design, then this will
// become a noop;
inline VectorWithMemoryLimit<ValueId> getIdsFromVariable(
const ::Variable& variable, EvaluationContext* context) {
auto cols = context->_inputTable.numColumns();
VectorWithMemoryLimit<ValueId> result{context->_allocator};
CALL_FIXED_SIZE(cols, &getIdsFromVariableImpl, result, variable, context);
return result;
// Overload that reads the `beginIndex` and the `endIndex` directly from the
// `context
inline std::span<const ValueId> getIdsFromVariable(
const ::Variable& variable, const EvaluationContext* context) {
return getIdsFromVariable(variable, context, context->_beginIndex,
context->_endIndex);
}

/// Generators that yield `numItems` items for the various
Expand All @@ -62,12 +55,14 @@ requires isConstantResult<T> cppcoro::generator<T> resultGenerator(
}
}

template <SingleExpressionResult T>
requires isVectorResult<T> cppcoro::generator<typename T::value_type>
resultGenerator(T vector, size_t numItems) {
template <typename T>
requires isVectorResult<T>
auto resultGenerator(T vector, size_t numItems)
-> cppcoro::generator<std::remove_reference_t<decltype(vector[0])>> {
AD_CONTRACT_CHECK(numItems == vector.size());
for (auto& element : vector) {
co_yield std::move(element);
auto cpy = std::move(element);
co_yield cpy;
}
}

Expand All @@ -94,10 +89,9 @@ inline cppcoro::generator<Bool> resultGenerator(ad_utility::SetOfIntervals set,
template <SingleExpressionResult Input>
auto makeGenerator(Input&& input, size_t numItems, EvaluationContext* context) {
if constexpr (ad_utility::isSimilar<::Variable, Input>) {
// TODO: Also directly write a generator that lazily gets the Ids in chunks.
StrongIdsWithResultType inputWithVariableResolved{
std::span<const ValueId> inputWithVariableResolved{
getIdsFromVariable(std::forward<Input>(input), context)};
return resultGenerator(std::move(inputWithVariableResolved), numItems);
return resultGenerator(inputWithVariableResolved, numItems);
} else {
return resultGenerator(std::forward<Input>(input), numItems);
}
Expand Down
Loading

0 comments on commit c9b1958

Please sign in to comment.