Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix the cost estimate of SORT to not prefer binary search filters with large results #1663

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions src/engine/Sort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@

#include "engine/CallFixedSize.h"
#include "engine/Engine.h"
#include "engine/Filter.h"
#include "engine/IndexScan.h"
#include "engine/QueryExecutionTree.h"
#include "global/RuntimeParameters.h"

Expand Down Expand Up @@ -74,3 +76,48 @@
LOG(DEBUG) << "Sort result computation done." << endl;
return {std::move(idTable), resultSortedOn(), subRes->getSharedLocalVocab()};
}

// _____________________________________________________________________________
size_t Sort::getCostEstimate() {
size_t size = getSizeEstimateBeforeLimit();
size_t logSize =
size < 4 ? 2 : static_cast<size_t>(logb(static_cast<double>(size)));
size_t nlogn = size * logSize;
size_t subcost = subtree_->getCostEstimate();
// Return at least 1, s.t. the query planner will never emit an unnecessary
// sort of an empty `IndexScan`. This makes the testing of the query
// planner much easier.

// Don't return plain `n log n` but also incorporate the number of columns and
// a constant multiplicator for the inherent complexity of sorting.
auto result = std::max(1UL, 20 * getResultWidth() * (nlogn + subcost));

// Determine if the subtree is a FILTER of an INDEX SCAN. This case can be
// useful if the FILTER can be applied via binary search and the result is
// then so small that the SORT doesn't hurt anymore. But in case the FILTER
// doesn't filter out much, and the result size is beyond a configurable
// threshold, we want to heavily discourage the plan with the binary filter +
// sorting, because it breaks the lazy evaluation.
auto sizeEstimateOfFilteredScan = [&]() -> size_t {
if (auto filter =
dynamic_cast<const Filter*>(subtree_->getRootOperation().get())) {
if (dynamic_cast<const IndexScan*>(
filter->getSubtree()->getRootOperation().get())) {
return subtree_->getSizeEstimate();
}
}
return 0;
}();
size_t maxSizeFilteredScan =
RuntimeParameters()
.get<"max-materialization-size-filtered-scan">()
.getBytes() /
sizeof(Id) / subtree_->getResultWidth();
if (sizeEstimateOfFilteredScan > maxSizeFilteredScan) {
// If the filtered result is larger than the defined threshold, make the
// cost estimate much larger, s.t. the query planner will prefer a plan
// without the `SORT`.
result *= 10'000;
}

Check warning on line 121 in src/engine/Sort.cpp

View check run for this annotation

Codecov / codecov/patch

src/engine/Sort.cpp#L120-L121

Added lines #L120 - L121 were not covered by tests
return result;
}
12 changes: 1 addition & 11 deletions src/engine/Sort.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,7 @@ class Sort : public Operation {

std::shared_ptr<QueryExecutionTree> getSubtree() const { return subtree_; }

virtual size_t getCostEstimate() override {
size_t size = getSizeEstimateBeforeLimit();
size_t logSize =
size < 4 ? 2 : static_cast<size_t>(logb(static_cast<double>(size)));
size_t nlogn = size * logSize;
size_t subcost = subtree_->getCostEstimate();
// Return at least 1, s.t. the query planner will never emit an unnecessary
// sort of an empty `IndexScan`. This makes the testing of the query
// planner much easier.
return std::max(1UL, nlogn + subcost);
}
size_t getCostEstimate() override;

virtual bool knownEmptyResult() override {
return subtree_->knownEmptyResult();
Expand Down
4 changes: 3 additions & 1 deletion src/global/RuntimeParameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,9 @@ inline auto& RuntimeParameters() {
Bool<"throw-on-unbound-variables">{false},
// Control up until which size lazy results should be cached. Caching
// does cause significant overhead for this case.
MemorySizeParameter<"lazy-result-max-cache-size">{5_MB}};
MemorySizeParameter<"lazy-result-max-cache-size">{5_MB},
MemorySizeParameter<"max-materialization-size-filtered-scan">{100_MB},
};
}();
return params;
}
Expand Down
Loading