From f7c2c3219d49ce65138d5d5ee21c8d74d188470c Mon Sep 17 00:00:00 2001 From: Johannes Kalmbach Date: Wed, 17 Jan 2024 15:01:20 +0100 Subject: [PATCH] Add triples of `ql:has-pattern` predicate to PSO and POS (#1226) The PSO and POS permutation now also contain the triples of the internal ql:has-pattern predicate. These will be used as a fallback for the new pattern implementation (which will come with one of the next commits). Note that we don't need the triples in the other four permutations, so the pair PSO&POS now has more triples than SPO&SOP and OSP&OPS. --- .../idTable/CompressedExternalIdTable.h | 35 ++++++++++- src/engine/idTable/IdTable.h | 14 ++++- src/engine/idTable/IdTableRow.h | 10 +++- src/global/SpecialIds.h | 10 ++++ src/index/IndexImpl.cpp | 22 ++++++- src/index/IndexImpl.h | 41 ++++++++----- src/parser/TripleComponent.h | 3 + test/IdTableTest.cpp | 22 +++++++ test/IndexTest.cpp | 60 +++++++++---------- test/TripleComponentTest.cpp | 4 ++ .../idTable/CompressedExternalIdTableTest.cpp | 54 ++++++++++------- test/util/IdTableHelpers.h | 16 +++++ test/util/IndexTestHelpers.cpp | 15 +++++ 13 files changed, 232 insertions(+), 74 deletions(-) diff --git a/src/engine/idTable/CompressedExternalIdTable.h b/src/engine/idTable/CompressedExternalIdTable.h index 159f7d8072..0b6c197e5c 100644 --- a/src/engine/idTable/CompressedExternalIdTable.h +++ b/src/engine/idTable/CompressedExternalIdTable.h @@ -316,6 +316,10 @@ class CompressedExternalIdTableBase { CompressedExternalIdTableWriter writer_; std::future compressAndWriteFuture_; + // Store whether this table has previously already been iterated over (in + // which case this member becomes `false`). + std::atomic isFirstIteration_ = true; + [[no_unique_address]] BlockTransformation blockTransformation_{}; public: @@ -364,6 +368,7 @@ class CompressedExternalIdTableBase { } writer_.clear(); numBlocksPushed_ = 0; + isFirstIteration_ = true; } protected: @@ -401,6 +406,9 @@ class CompressedExternalIdTableBase { // until the pushing is actually finished, and return `true`. Using this // function allows for an efficient usage of this class for very small inputs. bool transformAndPushLastBlock() { + if (!isFirstIteration_) { + return numBlocksPushed_ != 0; + } // If we have pushed at least one (complete) block, then the last future // from pushing a block is still in flight. If we have never pushed a block, // then also the future cannot be valid. @@ -549,6 +557,9 @@ class CompressedExternalIdTableSorter // output phase. int numBufferedOutputBlocks_ = 4; + // See the `moveResultOnMerge()` getter function for documentation. + bool moveResultOnMerge_ = true; + public: // Constructor. CompressedExternalIdTableSorter( @@ -579,6 +590,18 @@ class CompressedExternalIdTableSorter // within this class. using Base::push; + // If set to `false` then the sorted result can be extracted multiple times. + // If set to `true` then the result is moved out and unusable after the first + // merge. In that case an exception will be thrown at the start of the second + // merge. + // Note: This mechanism gives a performance advantage for very small inputs + // that can be completely sorted in RAM. In that case we can avoid a copy of + // the sorted result. + bool& moveResultOnMerge() { + AD_CONTRACT_CHECK(this->isFirstIteration_); + return moveResultOnMerge_; + } + // Transition from the input phase, where `push()` can be called, to the // output phase and return a generator that yields the sorted elements one by // one. Either this function or the following function must be called exactly @@ -594,6 +617,8 @@ class CompressedExternalIdTableSorter requires(N == NumStaticCols || N == 0) cppcoro::generator> getSortedBlocks( std::optional blocksize = std::nullopt) { + // If we move the result out, there must only be a single merge phase. + AD_CONTRACT_CHECK(this->isFirstIteration_ || !this->moveResultOnMerge_); mergeIsActive_.store(true); // Explanation for the second argument: One block is buffered by this // generator, one block is buffered inside the `sortedBlocks` generator, so @@ -604,6 +629,7 @@ class CompressedExternalIdTableSorter std::max(1, numBufferedOutputBlocks_ - 2))) { co_yield block; } + this->isFirstIteration_ = false; mergeIsActive_.store(false); } @@ -637,8 +663,15 @@ class CompressedExternalIdTableSorter auto& block = this->currentBlock_; const auto blocksizeOutput = blocksize.value_or(block.numRows()); if (block.numRows() <= blocksizeOutput) { - co_yield std::move(this->currentBlock_).template toStatic(); + if (this->moveResultOnMerge_) { + co_yield std::move(this->currentBlock_).template toStatic(); + } else { + auto blockAsStatic = IdTableStatic( + this->currentBlock_.clone().template toStatic()); + co_yield blockAsStatic; + } } else { + // TODO Use `std::views::chunk`. for (size_t i = 0; i < block.numRows(); i += blocksizeOutput) { size_t upper = std::min(i + blocksizeOutput, block.numRows()); auto curBlock = IdTableStatic( diff --git a/src/engine/idTable/IdTable.h b/src/engine/idTable/IdTable.h index 0184b14cef..7d611c050c 100644 --- a/src/engine/idTable/IdTable.h +++ b/src/engine/idTable/IdTable.h @@ -124,7 +124,8 @@ class IdTable { static constexpr bool columnsAreAllocatable = std::is_constructible_v; - using value_type = T; + // The type of a single entry in a row. + using single_value_type = T; // Because of the column-major layout, the `row_type` (a value type that // stores the values of a single row) and the `row_reference` (a type that // refers to a specific row of a specific `IdTable`) are different. They are @@ -135,6 +136,11 @@ class IdTable { using row_reference = RowReference; using const_row_reference = RowReference; + // This alias is required to make the `IdTable` class work with advanced GTest + // features, because GTest uses `Container::value_type` directly instead of + // using `std::iterator_traits`. + using value_type = row_type; + private: // Assign shorter aliases for some types that are important for the correct // handling of the proxy reference, but that are not visible to the outside. @@ -526,7 +532,7 @@ class IdTable { // numColumns()` implies that the function applies a permutation to the table. // For example `setColumnSubset({1, 2, 0})` rotates the columns of a table // with three columns left by one element. - void setColumnSubset(std::span subset) requires isDynamic { + void setColumnSubset(std::span subset) { // First check that the `subset` is indeed a subset of the column // indices. std::vector check{subset.begin(), subset.end()}; @@ -534,6 +540,10 @@ class IdTable { AD_CONTRACT_CHECK(std::unique(check.begin(), check.end()) == check.end()); AD_CONTRACT_CHECK(!subset.empty() && subset.back() < numColumns()); + // If the number of columns is statically fixed, then only a permutation of + // the columns and not a real subset is allowed. + AD_CONTRACT_CHECK(isDynamic || subset.size() == NumColumns); + Data newData; newData.reserve(subset.size()); std::ranges::for_each(subset, [this, &newData](ColumnIndex colIdx) { diff --git a/src/engine/idTable/IdTableRow.h b/src/engine/idTable/IdTableRow.h index d28d76c696..21294df659 100644 --- a/src/engine/idTable/IdTableRow.h +++ b/src/engine/idTable/IdTableRow.h @@ -85,6 +85,14 @@ class Row { friend void swap(Row& a, Row& b) { std::swap(a.data_, b.data_); } bool operator==(const Row& other) const = default; + + // Convert from a static `RowReference` to a `std::array` (makes a copy). + explicit operator std::array() const + requires(numStaticColumns != 0) { + std::array result; + std::ranges::copy(*this, result.begin()); + return result; + } }; // The following two classes store a reference to a row in the underlying @@ -120,7 +128,7 @@ class RowReferenceImpl { public: static constexpr bool isConst = isConstTag == ad_utility::IsConst::True; using TablePtr = std::conditional_t; - using T = typename Table::value_type; + using T = typename Table::single_value_type; static constexpr int numStaticColumns = Table::numStaticColumns; // Grant the `IdTable` class access to the internal details. diff --git a/src/global/SpecialIds.h b/src/global/SpecialIds.h index d96fcdebfd..e28aa1ef02 100644 --- a/src/global/SpecialIds.h +++ b/src/global/SpecialIds.h @@ -31,6 +31,16 @@ static const inline ad_utility::HashMap specialIds = []() { AD_CORRECTNESS_CHECK(uniqueIds.size() == result.size()); return result; }(); + +// Return the [lowerBound, upperBound) for the special Ids. +// This range can be used to filter them out in cases where we want to ignore +// triples that were added by QLever for internal reasons. +static constexpr std::pair getBoundsForSpecialIds() { + constexpr auto upperBound = Id::makeFromBool(false); + static_assert(static_cast(Datatype::Undefined) == 0); + static_assert(upperBound.getBits() == 1UL << Id::numDataBits); + return {Id::fromBits(1), upperBound}; +} } // namespace qlever #endif // QLEVER_SPECIALIDS_H diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 1e58a23291..d73f0a1a36 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -192,6 +192,9 @@ std::unique_ptr> IndexImpl::buildOspWithPatterns( auto isQleverInternalId) { auto&& [hasPatternPredicateSortedByPSO, secondSorter] = sortersFromPatternCreator; + // We need the patterns twice: once for the additional column, and once for + // the additional permutation. + hasPatternPredicateSortedByPSO->moveResultOnMerge() = false; // The column with index 1 always is `has-predicate` and is not needed here. // Note that the order of the columns during index building is alwasy `SPO`, // but the sorting might be different (PSO in this case). @@ -259,6 +262,19 @@ std::unique_ptr> IndexImpl::buildOspWithPatterns( makeSorterPtr("third"); createSecondPermutationPair(NumColumnsIndexBuilding + 2, isQleverInternalId, std::move(blockGenerator), *thirdSorter); + // Add the `ql:has-pattern` predicate to the sorter such that it will become + // part of the PSO and POS permutation. + LOG(INFO) << "Adding " << hasPatternPredicateSortedByPSO->size() + << " additional triples to the POS and PSO permutation for the " + "`ql:has-pattern` predicate ..." + << std::endl; + auto noPattern = Id::makeFromInt(NO_PATTERN); + static_assert(NumColumnsIndexBuilding == 3); + for (const auto& row : hasPatternPredicateSortedByPSO->sortedView()) { + // The repetition of the pattern index (`row[2]`) for the fourth column is + // useful for generic unit testing, but not needed otherwise. + thirdSorter->push(std::array{row[0], row[1], row[2], row[2], noPattern}); + } return thirdSorter; } // _____________________________________________________________________________ @@ -282,7 +298,10 @@ void IndexImpl::createFromFile(const string& filename) { writeConfiguration(); auto isQleverInternalId = [&indexBuilderData](const auto& id) { - return indexBuilderData.vocabularyMetaData_.isQleverInternalId(id); + // The special internal IDs like `ql:has-pattern` (see `SpecialIds.h`) + // have the datatype `UNDEFINED`. + return indexBuilderData.vocabularyMetaData_.isQleverInternalId(id) || + id.getDatatype() == Datatype::Undefined; }; // For the first permutation, perform a unique. @@ -754,6 +773,7 @@ void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) { totalVocabularySize_ = vocab_.size() + vocab_.getExternalVocab().size(); LOG(DEBUG) << "Number of words in internal and external vocabulary: " << totalVocabularySize_ << std::endl; + pso_.loadFromDisk(onDiskBase_); pos_.loadFromDisk(onDiskBase_); diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 4f57819157..8800d670a4 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -668,35 +669,43 @@ class IndexImpl { // index scan) and `GroupBy.cpp`. auto getIgnoredIdRanges(const Permutation::Enum permutation) const { std::vector> ignoredRanges; + ignoredRanges.emplace_back(qlever::getBoundsForSpecialIds()); auto literalRange = getVocab().prefix_range("\""); auto taggedPredicatesRange = getVocab().prefix_range("@"); auto internalEntitiesRange = getVocab().prefix_range(INTERNAL_ENTITIES_URI_PREFIX); - ignoredRanges.emplace_back( - Id::makeFromVocabIndex(internalEntitiesRange.first), - Id::makeFromVocabIndex(internalEntitiesRange.second)); + auto pushIgnoredRange = [&ignoredRanges](const auto& range) { + ignoredRanges.emplace_back(Id::makeFromVocabIndex(range.first), + Id::makeFromVocabIndex(range.second)); + }; + pushIgnoredRange(internalEntitiesRange); using enum Permutation::Enum; if (permutation == SPO || permutation == SOP) { - ignoredRanges.push_back({Id::makeFromVocabIndex(literalRange.first), - Id::makeFromVocabIndex(literalRange.second)}); + pushIgnoredRange(literalRange); } else if (permutation == PSO || permutation == POS) { - ignoredRanges.push_back( - {Id::makeFromVocabIndex(taggedPredicatesRange.first), - Id::makeFromVocabIndex(taggedPredicatesRange.second)}); + pushIgnoredRange(taggedPredicatesRange); } - auto isIllegalPredicateId = [=](Id predicateId) { + // A lambda that checks whether the `predicateId` is an internal ID like + // `ql:has-pattern` or `@en@rdfs:label`. + auto isInternalPredicateId = [internalEntitiesRange, + taggedPredicatesRange](Id predicateId) { + if (predicateId.getDatatype() == Datatype::Undefined) { + return true; + } + AD_CORRECTNESS_CHECK(predicateId.getDatatype() == Datatype::VocabIndex); auto idx = predicateId.getVocabIndex(); - return (idx >= internalEntitiesRange.first && - idx < internalEntitiesRange.second) || - (idx >= taggedPredicatesRange.first && - idx < taggedPredicatesRange.second); + auto isInRange = [idx](const auto& range) { + return range.first <= idx && idx < range.second; + }; + return (isInRange(internalEntitiesRange) || + isInRange(taggedPredicatesRange)); }; auto isTripleIgnored = [permutation, - isIllegalPredicateId](const auto& triple) { + isInternalPredicateId](const auto& triple) { // TODO: // A lot of code (especially for statistical queries in `GroupBy.cpp` and // the pattern trick) relies on this function being a noop for the `PSO` @@ -707,9 +716,9 @@ class IndexImpl { // be thoroughly reviewed. if (permutation == SPO || permutation == OPS) { // Predicates are always entities from the vocabulary. - return isIllegalPredicateId(triple[1]); + return isInternalPredicateId(triple[1]); } else if (permutation == SOP || permutation == OSP) { - return isIllegalPredicateId(triple[2]); + return isInternalPredicateId(triple[2]); } return false; }; diff --git a/src/parser/TripleComponent.h b/src/parser/TripleComponent.h index 03dd26253e..7d90ee1af3 100644 --- a/src/parser/TripleComponent.h +++ b/src/parser/TripleComponent.h @@ -14,6 +14,7 @@ #include "engine/LocalVocab.h" #include "global/Constants.h" #include "global/Id.h" +#include "global/SpecialIds.h" #include "parser/RdfEscaping.h" #include "parser/data/Variable.h" #include "util/Date.h" @@ -232,6 +233,8 @@ class TripleComponent { isString() ? getString() : getLiteral().rawContent(); if (vocabulary.getId(content, &idx)) { return Id::makeFromVocabIndex(idx); + } else if (qlever::specialIds.contains(content)) { + return qlever::specialIds.at(content); } else { return std::nullopt; } diff --git a/test/IdTableTest.cpp b/test/IdTableTest.cpp index 03e7cc1eea..b4fb384a72 100644 --- a/test/IdTableTest.cpp +++ b/test/IdTableTest.cpp @@ -974,6 +974,28 @@ TEST(IdTable, setColumnSubset) { ASSERT_ANY_THROW(t.setColumnSubset(std::vector{1, 2})); } +TEST(IdTableStatic, setColumnSubset) { + using IntTable = columnBasedIdTable::IdTable; + IntTable t; + t.push_back({0, 10, 20}); + t.push_back({1, 11, 21}); + t.push_back({2, 12, 22}); + t.setColumnSubset(std::array{ColumnIndex(2), ColumnIndex(0), ColumnIndex(1)}); + ASSERT_EQ(3, t.numColumns()); + ASSERT_EQ(3, t.numRows()); + ASSERT_THAT(t.getColumn(0), ::testing::ElementsAre(20, 21, 22)); + ASSERT_THAT(t.getColumn(1), ::testing::ElementsAre(0, 1, 2)); + ASSERT_THAT(t.getColumn(2), ::testing::ElementsAre(10, 11, 12)); + + // Duplicate columns are not allowed. + ASSERT_ANY_THROW(t.setColumnSubset(std::vector{0, 0, 1})); + // A column index is out of range. + ASSERT_ANY_THROW(t.setColumnSubset(std::vector{1, 2, 3})); + + // For static tables, we need a permutation, a real subset is not allowed. + ASSERT_ANY_THROW(t.setColumnSubset(std::vector{1, 2})); +} + TEST(IdTable, cornerCases) { using Dynamic = columnBasedIdTable::IdTable; { diff --git a/test/IndexTest.cpp b/test/IndexTest.cpp index cb876ad550..8c0159ff91 100644 --- a/test/IndexTest.cpp +++ b/test/IndexTest.cpp @@ -19,6 +19,8 @@ using namespace ad_utility::testing; +using ::testing::UnorderedElementsAre; + namespace { using ad_utility::source_location; auto lit = ad_utility::testing::tripleComponentLiteral; @@ -434,6 +436,7 @@ TEST(IndexTest, getIgnoredIdRanges) { // The range of all literals; auto literals = std::pair{firstLiteral, increment(lastLiteral)}; + auto specialIds = qlever::getBoundsForSpecialIds(); { auto [ranges, lambda] = index.getIgnoredIdRanges(Permutation::POS); ASSERT_FALSE(lambda(std::array{label, firstLiteral, x})); @@ -443,10 +446,9 @@ TEST(IndexTest, getIgnoredIdRanges) { // `ranges`. ASSERT_FALSE(lambda(std::array{enLabel, firstLiteral, x})); ASSERT_FALSE(lambda(std::array{x, x, x})); - ASSERT_EQ(2u, ranges.size()); - - ASSERT_EQ(ranges[0], internalEntities); - ASSERT_EQ(ranges[1], predicatesWithLangtag); + EXPECT_THAT(ranges, + UnorderedElementsAre(internalEntities, predicatesWithLangtag, + specialIds)); } { auto [ranges, lambda] = index.getIgnoredIdRanges(Permutation::PSO); @@ -457,46 +459,41 @@ TEST(IndexTest, getIgnoredIdRanges) { // `ranges`. ASSERT_FALSE(lambda(std::array{enLabel, x, firstLiteral})); ASSERT_FALSE(lambda(std::array{x, x, x})); - ASSERT_EQ(2u, ranges.size()); - - ASSERT_EQ(ranges[0], internalEntities); - ASSERT_EQ(ranges[1], predicatesWithLangtag); + EXPECT_THAT(ranges, + UnorderedElementsAre(internalEntities, predicatesWithLangtag, + specialIds)); } { auto [ranges, lambda] = index.getIgnoredIdRanges(Permutation::SOP); ASSERT_TRUE(lambda(std::array{x, firstLiteral, enLabel})); ASSERT_FALSE(lambda(std::array{x, firstLiteral, label})); ASSERT_FALSE(lambda(std::array{x, x, label})); - ASSERT_EQ(2u, ranges.size()); - - ASSERT_EQ(ranges[0], internalEntities); - ASSERT_EQ(ranges[1], literals); + EXPECT_THAT(ranges, + UnorderedElementsAre(internalEntities, literals, specialIds)); } { auto [ranges, lambda] = index.getIgnoredIdRanges(Permutation::SPO); ASSERT_TRUE(lambda(std::array{x, enLabel, firstLiteral})); ASSERT_FALSE(lambda(std::array{x, label, firstLiteral})); ASSERT_FALSE(lambda(std::array{x, label, x})); - ASSERT_EQ(2u, ranges.size()); - - ASSERT_EQ(ranges[0], internalEntities); - ASSERT_EQ(ranges[1], literals); + EXPECT_THAT(ranges, + UnorderedElementsAre(internalEntities, literals, specialIds)); } { auto [ranges, lambda] = index.getIgnoredIdRanges(Permutation::OSP); ASSERT_TRUE(lambda(std::array{firstLiteral, x, enLabel})); ASSERT_FALSE(lambda(std::array{firstLiteral, x, label})); ASSERT_FALSE(lambda(std::array{x, x, label})); - ASSERT_EQ(1u, ranges.size()); - ASSERT_EQ(ranges[0], internalEntities); + EXPECT_THAT(ranges, UnorderedElementsAre(internalEntities, specialIds)); } { auto [ranges, lambda] = index.getIgnoredIdRanges(Permutation::OPS); ASSERT_TRUE(lambda(std::array{firstLiteral, enLabel, x})); ASSERT_FALSE(lambda(std::array{firstLiteral, label, x})); ASSERT_FALSE(lambda(std::array{x, label, x})); - ASSERT_EQ(1u, ranges.size()); - ASSERT_EQ(ranges[0], internalEntities); + auto hasPattern = qlever::specialIds.at(HAS_PATTERN_PREDICATE); + ASSERT_TRUE(lambda(std::array{firstLiteral, hasPattern, x})); + EXPECT_THAT(ranges, UnorderedElementsAre(internalEntities, specialIds)); } } @@ -520,29 +517,32 @@ TEST(IndexTest, NumDistinctEntities) { auto predicates = index.numDistinctPredicates(); EXPECT_EQ(predicates.normal_, 2); - // One added predicate is `ql:langtag` and one added predicate for - // each combination of predicate+language that is actually used (e.g. - // `@en@label`). - EXPECT_EQ(predicates.internal_, 2); + // The added predicates are `ql:has-pattern`, `ql:langtag`, and one added + // predicate for each combination of predicate+language that is actually used + // (e.g. `@en@label`). + EXPECT_EQ(predicates.internal_, 3); EXPECT_EQ(predicates, index.numDistinctCol0(Permutation::PSO)); EXPECT_EQ(predicates, index.numDistinctCol0(Permutation::POS)); auto objects = index.numDistinctObjects(); EXPECT_EQ(objects.normal_, 7); - // One added object for each language that is used + // One added object for each language that is used. + // Note: The pattern indices from the `ql:has-pattern` predicate are currently + // not part of `objects.internal_`, but they are also not very important. EXPECT_EQ(objects.internal_, 1); EXPECT_EQ(objects, index.numDistinctCol0(Permutation::OSP)); EXPECT_EQ(objects, index.numDistinctCol0(Permutation::OPS)); auto numTriples = index.numTriples(); EXPECT_EQ(numTriples.normal_, 7); - // Two added triples for each triple that has an object with a language tag. - EXPECT_EQ(numTriples.internal_, 2); + // Two added triples for each triple that has an object with a language tag + // and one triple per subject for the pattern. + EXPECT_EQ(numTriples.internal_, 5); auto multiplicities = index.getMultiplicities(Permutation::SPO); - EXPECT_FLOAT_EQ(multiplicities[0], 9.0 / 4.0); - EXPECT_FLOAT_EQ(multiplicities[1], 9.0 / 4.0); - EXPECT_FLOAT_EQ(multiplicities[2], 9.0 / 8.0); + EXPECT_FLOAT_EQ(multiplicities[0], 12.0 / 4.0); + EXPECT_FLOAT_EQ(multiplicities[1], 12.0 / 5.0); + EXPECT_FLOAT_EQ(multiplicities[2], 12.0 / 8.0); multiplicities = index.getMultiplicities("", Permutation::SPO); EXPECT_FLOAT_EQ(multiplicities[0], 2.5); diff --git a/test/TripleComponentTest.cpp b/test/TripleComponentTest.cpp index 44f38be43e..e7792280aa 100644 --- a/test/TripleComponentTest.cpp +++ b/test/TripleComponentTest.cpp @@ -157,6 +157,10 @@ TEST(TripleComponent, toValueId) { tc = 42; ASSERT_EQ(tc.toValueIdIfNotString().value(), I(42)); + + tc = HAS_PATTERN_PREDICATE; + ASSERT_EQ(tc.toValueId(vocab).value(), + qlever::specialIds.at(HAS_PATTERN_PREDICATE)); } TEST(TripleComponent, settingVariablesAsStringsIsIllegal) { diff --git a/test/engine/idTable/CompressedExternalIdTableTest.cpp b/test/engine/idTable/CompressedExternalIdTableTest.cpp index 9847473b2c..5d679ef814 100644 --- a/test/engine/idTable/CompressedExternalIdTableTest.cpp +++ b/test/engine/idTable/CompressedExternalIdTableTest.cpp @@ -15,21 +15,6 @@ using ad_utility::source_location; using namespace ad_utility::memory_literals; namespace { -// Implementation of a class that inherits from `IdTable` but is copyable -// (convenient for testing). -template -using TableImpl = std::conditional_t>; -template -class CopyableIdTable : public TableImpl { - public: - using Base = TableImpl; - using Base::Base; - CopyableIdTable(const CopyableIdTable& rhs) : Base{rhs.clone()} {} - CopyableIdTable& operator=(const CopyableIdTable& rhs) { - static_cast(*this) = rhs.clone(); - return *this; - } -}; // From a `generator` that yields `IdTable`s, create a single `IdTable` that is // the concatenation of all the yielded tables. @@ -93,9 +78,10 @@ TEST(CompressedExternalIdTable, compressedExternalIdTableWriter) { } template -void testExternalSorter(size_t numDynamicColumns, size_t numRows, - ad_utility::MemorySize memoryToUse, - source_location l = source_location::current()) { +void testExternalSorterImpl(size_t numDynamicColumns, size_t numRows, + ad_utility::MemorySize memoryToUse, + bool mergeMultipleTimes, + source_location l = source_location::current()) { auto tr = generateLocationTrace(l); std::string filename = "idTableCompressedSorter.testExternalSorter.dat"; using namespace ad_utility::memory_literals; @@ -116,16 +102,38 @@ void testExternalSorter(size_t numDynamicColumns, size_t numRows, std::ranges::sort(randomTable, SortByOSP{}); - auto generator = writer.sortedView(); + if (mergeMultipleTimes) { + writer.moveResultOnMerge() = false; + } - using namespace ::testing; - auto result = - idTableFromRowGenerator(generator, numDynamicColumns); - ASSERT_THAT(result, Eq(randomTable)); + for (size_t k = 0; k < 5; ++k) { + auto generator = writer.sortedView(); + using namespace ::testing; + if (mergeMultipleTimes || k == 0) { + auto result = idTableFromRowGenerator( + generator, numDynamicColumns); + ASSERT_THAT(result, Eq(randomTable)) << "k = " << k; + } else { + EXPECT_ANY_THROW((idTableFromRowGenerator( + generator, numDynamicColumns))); + } + // We cannot access or change this value after the first merge. + EXPECT_ANY_THROW(writer.moveResultOnMerge()); + } writer.clear(); } } +template +void testExternalSorter(size_t numDynamicColumns, size_t numRows, + ad_utility::MemorySize memoryToUse, + source_location l = source_location::current()) { + testExternalSorterImpl(numDynamicColumns, numRows, + memoryToUse, true, l); + testExternalSorterImpl(numDynamicColumns, numRows, + memoryToUse, false, l); +} + TEST(CompressedExternalIdTable, sorterRandomInputs) { using namespace ad_utility::memory_literals; // Test for dynamic (<0>) and static(<3>) tables. diff --git a/test/util/IdTableHelpers.h b/test/util/IdTableHelpers.h index 8be0616f73..1f9eca2436 100644 --- a/test/util/IdTableHelpers.h +++ b/test/util/IdTableHelpers.h @@ -35,6 +35,22 @@ struct IdTableAndJoinColumn { size_t joinColumn; }; +// Implementation of a class that inherits from `IdTable` but is copyable +// (convenient for testing). +template +using TableImpl = std::conditional_t>; +template +class CopyableIdTable : public TableImpl { + public: + using Base = TableImpl; + using Base::Base; + CopyableIdTable(const CopyableIdTable& rhs) : Base{rhs.clone()} {} + CopyableIdTable& operator=(const CopyableIdTable& rhs) { + static_cast(*this) = rhs.clone(); + return *this; + } +}; + // For easier reading. We repeat that type combination so often, that this // will make things a lot easier in terms of reading and writing. using IntOrId = std::variant; diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index b0d888746a..44c69e1864 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -5,6 +5,7 @@ #include "../IndexTestHelpers.h" #include "./GTestHelpers.h" +#include "global/SpecialIds.h" #include "index/IndexImpl.h" namespace ad_utility::testing { @@ -70,6 +71,19 @@ void checkConsistencyBetweenOldAndNewPatterns(const Index& index) { auto scanResult = index.scan(col0Id, std::nullopt, permutation, std::array{ColumnIndex{2}, ColumnIndex{3}}, cancellationDummy); + auto hasPatternId = qlever::specialIds.at(HAS_PATTERN_PREDICATE); + auto scanResultHasPattern = + index.scan(hasPatternId, col0Id, Permutation::Enum::PSO, {}, + cancellationDummy); + // Each ID has at most one pattern, it can have none if it doesn't + // appear as a subject in the knowledge graph. + AD_CORRECTNESS_CHECK(scanResultHasPattern.numRows() <= 1); + if (scanResultHasPattern.numRows() == 0) { + checkSingleElement(index, NO_PATTERN, col0Id); + } else { + checkSingleElement(index, scanResultHasPattern(0, 0).getInt(), + col0Id); + } ASSERT_EQ(scanResult.numColumns(), 4u); for (const auto& row : scanResult) { auto patternIdx = row[2].getInt(); @@ -153,6 +167,7 @@ Index makeTestIndex(const std::string& indexBasename, EXPECT_EQ(index.loadAllPermutations(), loadAllPermutations); EXPECT_EQ(index.usePatterns(), usePatterns); } + Index index{ad_utility::makeUnlimitedAllocator()}; index.usePatterns() = usePatterns; index.loadAllPermutations() = loadAllPermutations;