From 81a93505e06e777f3063eac5181337c8e74cb57e Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Tue, 15 Oct 2024 18:14:06 +0200 Subject: [PATCH] Implement lazy `BIND` (#1543) Allow the `BIND` operation to handle its input lazily. NOTE: Currently there is only a single local vocab for all the results of the `BIND`, so even when a `BIND` that creates strings is handled lazily, we still need the RAM for the complete local vocab. This will be handled in a follow-up PR. --- src/engine/Bind.cpp | 147 +++++++++++++++++++---------------- src/engine/Bind.h | 32 ++++---- src/engine/idTable/IdTable.h | 6 ++ test/IdTableTest.cpp | 17 ++++ test/engine/BindTest.cpp | 133 +++++++++++++++++++++++++++++++ test/engine/CMakeLists.txt | 1 + 6 files changed, 252 insertions(+), 84 deletions(-) create mode 100644 test/engine/BindTest.cpp diff --git a/src/engine/Bind.cpp b/src/engine/Bind.cpp index 83aef56e2e..dde7e019d9 100644 --- a/src/engine/Bind.cpp +++ b/src/engine/Bind.cpp @@ -81,64 +81,84 @@ std::vector Bind::getChildren() { } // _____________________________________________________________________________ -ProtoResult Bind::computeResult([[maybe_unused]] bool requestLaziness) { - using std::endl; - LOG(DEBUG) << "Get input to BIND operation..." << endl; - std::shared_ptr subRes = _subtree->getResult(); - LOG(DEBUG) << "Got input to Bind operation." << endl; - IdTable idTable{getExecutionContext()->getAllocator()}; - - idTable.setNumColumns(getResultWidth()); - - // Make a deep copy of the local vocab from `subRes` and then add to it (in - // case BIND adds a new word or words). - // - // TODO: In most BIND operations, nothing is added to the local vocabulary, so - // it would be more efficient to first share the pointer here (like with - // `shareLocalVocabFrom`) and only copy it when a new word is about to be - // added. Same for GROUP BY. 
- auto localVocab = subRes->getCopyOfLocalVocab(); - - size_t inwidth = subRes->idTable().numColumns(); - size_t outwidth = getResultWidth(); - - CALL_FIXED_SIZE((std::array{inwidth, outwidth}), &Bind::computeExpressionBind, - this, &idTable, &localVocab, *subRes, - _bind._expression.getPimpl()); - - LOG(DEBUG) << "BIND result computation done." << endl; - return {std::move(idTable), resultSortedOn(), std::move(localVocab)}; +IdTable Bind::cloneSubView(const IdTable& idTable, + const std::pair& subrange) { + IdTable result(idTable.numColumns(), idTable.getAllocator()); + result.resize(subrange.second - subrange.first); + std::ranges::copy(idTable.begin() + subrange.first, + idTable.begin() + subrange.second, result.begin()); + return result; } // _____________________________________________________________________________ -template -void Bind::computeExpressionBind( - IdTable* outputIdTable, LocalVocab* outputLocalVocab, - const Result& inputResultTable, - sparqlExpression::SparqlExpression* expression) const { +ProtoResult Bind::computeResult(bool requestLaziness) { + LOG(DEBUG) << "Get input to BIND operation..." << std::endl; + std::shared_ptr subRes = _subtree->getResult(requestLaziness); + LOG(DEBUG) << "Got input to Bind operation." 
<< std::endl; + + auto applyBind = [this, subRes](IdTable idTable, LocalVocab* localVocab) { + return computeExpressionBind(localVocab, std::move(idTable), + subRes->localVocab(), + _bind._expression.getPimpl()); + }; + + if (subRes->isFullyMaterialized()) { + if (requestLaziness && subRes->idTable().size() > CHUNK_SIZE) { + auto localVocab = + std::make_shared(subRes->getCopyOfLocalVocab()); + auto generator = [](std::shared_ptr vocab, auto applyBind, + std::shared_ptr result) + -> cppcoro::generator { + size_t size = result->idTable().size(); + for (size_t offset = 0; offset < size; offset += CHUNK_SIZE) { + co_yield applyBind( + cloneSubView(result->idTable(), + {offset, std::min(size, offset + CHUNK_SIZE)}), + vocab.get()); + } + }(localVocab, std::move(applyBind), std::move(subRes)); + return {std::move(generator), resultSortedOn(), std::move(localVocab)}; + } + // Make a copy of the local vocab from `subRes` and then add to it (in + // case BIND adds a new word or words). + // + // Make a copy of the local vocab from `subRes` and then add to it (in case + // BIND adds new words). Note: The copy of the local vocab is shallow + // via `shared_ptr`s, so the following is also efficient if the BIND adds no + // new words. + LocalVocab localVocab = subRes->getCopyOfLocalVocab(); + IdTable result = applyBind(subRes->idTable().clone(), &localVocab); + LOG(DEBUG) << "BIND result computation done." 
<< std::endl; + return {std::move(result), resultSortedOn(), std::move(localVocab)}; + } + auto localVocab = std::make_shared(); + auto generator = + [](std::shared_ptr vocab, auto applyBind, + std::shared_ptr result) -> cppcoro::generator { + for (IdTable& idTable : result->idTables()) { + co_yield applyBind(std::move(idTable), vocab.get()); + } + std::array vocabs{vocab.get(), &result->localVocab()}; + *vocab = LocalVocab::merge(std::span{vocabs}); + }(localVocab, std::move(applyBind), std::move(subRes)); + return {std::move(generator), resultSortedOn(), std::move(localVocab)}; +} + +// _____________________________________________________________________________ +IdTable Bind::computeExpressionBind( + LocalVocab* outputLocalVocab, IdTable idTable, + const LocalVocab& inputLocalVocab, + const sparqlExpression::SparqlExpression* expression) const { sparqlExpression::EvaluationContext evaluationContext( - *getExecutionContext(), _subtree->getVariableColumns(), - inputResultTable.idTable(), getExecutionContext()->getAllocator(), - inputResultTable.localVocab(), cancellationHandle_, deadline_); + *getExecutionContext(), _subtree->getVariableColumns(), idTable, + getExecutionContext()->getAllocator(), inputLocalVocab, + cancellationHandle_, deadline_); sparqlExpression::ExpressionResult expressionResult = expression->evaluate(&evaluationContext); - const auto input = inputResultTable.idTable().asStaticView(); - auto output = std::move(*outputIdTable).toStatic(); - - // first initialize the first columns (they remain identical) - const auto inSize = input.size(); - output.reserve(inSize); - const auto inCols = input.numColumns(); - // copy the input to the first numColumns; - for (size_t i = 0; i < inSize; ++i) { - output.emplace_back(); - for (size_t j = 0; j < inCols; ++j) { - output(i, j) = input(i, j); - } - checkCancellation(); - } + idTable.addEmptyColumn(); + auto outputColumn = idTable.getColumn(idTable.numColumns() - 1); auto visitor = [&]( T&& singleResult) 
mutable { @@ -146,22 +166,19 @@ void Bind::computeExpressionBind( constexpr static bool isStrongId = std::is_same_v; if constexpr (isVariable) { - auto column = + auto columnIndex = getInternallyVisibleVariableColumns().at(singleResult).columnIndex_; - for (size_t i = 0; i < inSize; ++i) { - output(i, inCols) = output(i, column); - checkCancellation(); - } + auto inputColumn = idTable.getColumn(columnIndex); + AD_CORRECTNESS_CHECK(inputColumn.size() == outputColumn.size()); + std::ranges::copy(inputColumn, outputColumn.begin()); } else if constexpr (isStrongId) { - for (size_t i = 0; i < inSize; ++i) { - output(i, inCols) = singleResult; - checkCancellation(); - } + std::ranges::fill(outputColumn, singleResult); } else { constexpr bool isConstant = sparqlExpression::isConstantResult; auto resultGenerator = sparqlExpression::detail::makeGenerator( - std::forward(singleResult), inSize, &evaluationContext); + std::forward(singleResult), outputColumn.size(), + &evaluationContext); if constexpr (isConstant) { auto it = resultGenerator.begin(); @@ -169,16 +186,14 @@ void Bind::computeExpressionBind( Id constantId = sparqlExpression::detail::constantExpressionResultToId( std::move(*it), *outputLocalVocab); - for (size_t i = 0; i < inSize; ++i) { - output(i, inCols) = constantId; - checkCancellation(); - } + checkCancellation(); + std::ranges::fill(outputColumn, constantId); } } else { size_t i = 0; // We deliberately move the values from the generator. 
for (auto& resultValue : resultGenerator) { - output(i, inCols) = + outputColumn[i] = sparqlExpression::detail::constantExpressionResultToId( std::move(resultValue), *outputLocalVocab); i++; @@ -190,5 +205,5 @@ void Bind::computeExpressionBind( std::visit(visitor, std::move(expressionResult)); - *outputIdTable = std::move(output).toDynamic(); + return idTable; } diff --git a/src/engine/Bind.h b/src/engine/Bind.h index bb1996a967..eeaafaf3ed 100644 --- a/src/engine/Bind.h +++ b/src/engine/Bind.h @@ -1,9 +1,8 @@ -// -// Created by johannes on 19.04.20. -// +// Copyright 2020, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach -#ifndef QLEVER_BIND_H -#define QLEVER_BIND_H +#pragma once #include "engine/Operation.h" #include "engine/sparqlExpressions/SparqlExpressionPimpl.h" @@ -12,6 +11,8 @@ /// BIND operation, currently only supports a very limited subset of expressions class Bind : public Operation { public: + static constexpr size_t CHUNK_SIZE = 10'000; + Bind(QueryExecutionContext* qec, std::shared_ptr subtree, parsedQuery::Bind b) : Operation(qec), _subtree(std::move(subtree)), _bind(std::move(b)) {} @@ -37,25 +38,20 @@ class Bind : public Operation { float getMultiplicity(size_t col) override; bool knownEmptyResult() override; - // Returns the variable to which the expression will be bound - [[nodiscard]] const string& targetVariable() const { - return _bind._target.name(); - } - protected: [[nodiscard]] vector resultSortedOn() const override; private: - ProtoResult computeResult([[maybe_unused]] bool requestLaziness) override; + ProtoResult computeResult(bool requestLaziness) override; + + static IdTable cloneSubView(const IdTable& idTable, + const std::pair& subrange); // Implementation for the binding of arbitrary expressions. 
- template - void computeExpressionBind( - IdTable* outputIdTable, LocalVocab* outputLocalVocab, - const Result& inputResultTable, - sparqlExpression::SparqlExpression* expression) const; + IdTable computeExpressionBind( + LocalVocab* outputLocalVocab, IdTable idTable, + const LocalVocab& inputLocalVocab, + const sparqlExpression::SparqlExpression* expression) const; [[nodiscard]] VariableToColumnMap computeVariableToColumnMap() const override; }; - -#endif // QLEVER_BIND_H diff --git a/src/engine/idTable/IdTable.h b/src/engine/idTable/IdTable.h index 1f1f502503..29a0257cac 100644 --- a/src/engine/idTable/IdTable.h +++ b/src/engine/idTable/IdTable.h @@ -278,6 +278,12 @@ class IdTable { data().resize(numColumns, ColumnStorage{allocator_}); } + // Add a new empty column to the table. + void addEmptyColumn() requires columnsAreAllocatable && isDynamic { + data().emplace_back(size(), allocator_); + ++numColumns_; + } + // The number of rows in the table. We deliberately have an explicitly named // function `numRows` as well as a generic `size` function because the latter // can be used to write generic code, for example when using STL algorithms on diff --git a/test/IdTableTest.cpp b/test/IdTableTest.cpp index 46fb9a0f6c..34b1ad3072 100644 --- a/test/IdTableTest.cpp +++ b/test/IdTableTest.cpp @@ -1119,6 +1119,23 @@ TEST(IdTable, constructorsAreSfinaeFriendly) { static_assert(std::is_constructible_v); } +// _____________________________________________________________________________ +TEST(IdTable, addEmptyColumn) { + using ::testing::ElementsAre; + using ::testing::Eq; + IdTable table{1, ad_utility::makeUnlimitedAllocator()}; + table.push_back({V(1)}); + table.push_back({V(2)}); + + table.addEmptyColumn(); + + EXPECT_EQ(table.numColumns(), 2); + EXPECT_THAT(table.getColumn(0), ElementsAre(V(1), V(2))); + // The new column is uninitialized, so we can't make any more specific + // assertions about its content here. 
+ EXPECT_EQ(table.getColumn(1).size(), 2); +} + // Check that we can completely instantiate `IdTable`s with a different value // type and a different underlying storage. diff --git a/test/engine/BindTest.cpp b/test/engine/BindTest.cpp new file mode 100644 index 0000000000..d0f4309f56 --- /dev/null +++ b/test/engine/BindTest.cpp @@ -0,0 +1,133 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Robin Textor-Falconi + +#include + +#include "../util/IdTableHelpers.h" +#include "../util/IndexTestHelpers.h" +#include "./ValuesForTesting.h" +#include "engine/Bind.h" +#include "engine/sparqlExpressions/LiteralExpression.h" + +using namespace sparqlExpression; +using Vars = std::vector>; + +namespace { +Bind makeBindForIdTable(QueryExecutionContext* qec, IdTable idTable) { + auto valuesTree = ad_utility::makeExecutionTree( + qec, std::move(idTable), Vars{Variable{"?a"}}); + return { + qec, + std::move(valuesTree), + {SparqlExpressionPimpl{ + std::make_unique(Variable{"?a"}), "?a as ?b"}, + Variable{"?b"}}}; +} + +void expectBindYieldsIdTable( + QueryExecutionContext* qec, Bind& bind, const IdTable& expected, + ad_utility::source_location loc = ad_utility::source_location::current()) { + auto trace = generateLocationTrace(loc); + + { + qec->getQueryTreeCache().clearAll(); + auto result = bind.getResult(false, ComputationMode::FULLY_MATERIALIZED); + ASSERT_TRUE(result->isFullyMaterialized()); + EXPECT_EQ(result->idTable(), expected); + } + + { + qec->getQueryTreeCache().clearAll(); + auto result = bind.getResult(false, ComputationMode::LAZY_IF_SUPPORTED); + ASSERT_FALSE(result->isFullyMaterialized()); + auto& idTables = result->idTables(); + auto iterator = idTables.begin(); + ASSERT_NE(iterator, idTables.end()); + EXPECT_EQ(*iterator, expected); + EXPECT_EQ(++iterator, idTables.end()); + } +} +} // namespace + +// _____________________________________________________________________________ +TEST(Bind, computeResult) { + 
auto* qec = ad_utility::testing::getQec(); + Bind bind = + makeBindForIdTable(qec, makeIdTableFromVector({{1}, {2}, {3}, {4}})); + + expectBindYieldsIdTable( + qec, bind, makeIdTableFromVector({{1, 1}, {2, 2}, {3, 3}, {4, 4}})); +} + +// _____________________________________________________________________________ +TEST(Bind, computeResultWithTableWithoutRows) { + auto* qec = ad_utility::testing::getQec(); + Bind bind = makeBindForIdTable( + qec, IdTable{1, ad_utility::makeUnlimitedAllocator()}); + + expectBindYieldsIdTable(qec, bind, + IdTable{2, ad_utility::makeUnlimitedAllocator()}); +} + +// _____________________________________________________________________________ +TEST(Bind, computeResultWithTableWithoutColumns) { + auto val = Id::makeFromInt(42); + auto* qec = ad_utility::testing::getQec(); + auto valuesTree = ad_utility::makeExecutionTree( + qec, makeIdTableFromVector({{}, {}}), Vars{}); + Bind bind{ + qec, + std::move(valuesTree), + {SparqlExpressionPimpl{std::make_unique(val), "42 as ?b"}, + Variable{"?b"}}}; + + expectBindYieldsIdTable(qec, bind, makeIdTableFromVector({{val}, {val}})); +} + +// _____________________________________________________________________________ +TEST( + Bind, + computeResultProducesLazyResultWhenFullyMaterializedSubResultIsTooLargeAndRequested) { + auto val = Id::makeFromInt(42); + IdTable::row_type row{1}; + row[0] = val; + auto* qec = ad_utility::testing::getQec(); + IdTable table{1, ad_utility::makeUnlimitedAllocator()}; + table.resize(Bind::CHUNK_SIZE + 1); + std::ranges::fill(table, row); + auto valuesTree = ad_utility::makeExecutionTree( + qec, table.clone(), Vars{Variable{"?a"}}, false, + std::vector{}, LocalVocab{}, std::nullopt, true); + Bind bind{ + qec, + std::move(valuesTree), + {SparqlExpressionPimpl{std::make_unique(val), "42 as ?b"}, + Variable{"?b"}}}; + + table.addEmptyColumn(); + row = IdTable::row_type{2}; + row[0] = val; + row[1] = val; + std::ranges::fill(table, row); + { + 
qec->getQueryTreeCache().clearAll(); + auto result = bind.getResult(false, ComputationMode::FULLY_MATERIALIZED); + ASSERT_TRUE(result->isFullyMaterialized()); + EXPECT_EQ(result->idTable(), table); + } + + { + table.resize(Bind::CHUNK_SIZE); + qec->getQueryTreeCache().clearAll(); + auto result = bind.getResult(false, ComputationMode::LAZY_IF_SUPPORTED); + ASSERT_FALSE(result->isFullyMaterialized()); + auto& idTables = result->idTables(); + auto iterator = idTables.begin(); + ASSERT_NE(iterator, idTables.end()); + EXPECT_EQ(*iterator, table); + ASSERT_NE(++iterator, idTables.end()); + EXPECT_EQ(*iterator, makeIdTableFromVector({{val, val}})); + EXPECT_EQ(++iterator, idTables.end()); + } +} diff --git a/test/engine/CMakeLists.txt b/test/engine/CMakeLists.txt index 1c875c49cd..3a7e7d3cf7 100644 --- a/test/engine/CMakeLists.txt +++ b/test/engine/CMakeLists.txt @@ -8,3 +8,4 @@ addLinkAndDiscoverTest(DistinctTest engine) addLinkAndDiscoverTest(GroupByHashMapOptimizationTest) addLinkAndDiscoverTest(LazyGroupByTest engine) addLinkAndDiscoverTest(CountConnectedSubgraphsTest) +addLinkAndDiscoverTest(BindTest engine)