diff --git a/src/engine/idTable/CompressedExternalIdTable.h b/src/engine/idTable/CompressedExternalIdTable.h index 159f7d8072..0b6c197e5c 100644 --- a/src/engine/idTable/CompressedExternalIdTable.h +++ b/src/engine/idTable/CompressedExternalIdTable.h @@ -316,6 +316,10 @@ class CompressedExternalIdTableBase { CompressedExternalIdTableWriter writer_; std::future compressAndWriteFuture_; + // Store whether this table has previously already been iterated over (in + // which case this member becomes `false`). + std::atomic isFirstIteration_ = true; + [[no_unique_address]] BlockTransformation blockTransformation_{}; public: @@ -364,6 +368,7 @@ class CompressedExternalIdTableBase { } writer_.clear(); numBlocksPushed_ = 0; + isFirstIteration_ = true; } protected: @@ -401,6 +406,9 @@ class CompressedExternalIdTableBase { // until the pushing is actually finished, and return `true`. Using this // function allows for an efficient usage of this class for very small inputs. bool transformAndPushLastBlock() { + if (!isFirstIteration_) { + return numBlocksPushed_ != 0; + } // If we have pushed at least one (complete) block, then the last future // from pushing a block is still in flight. If we have never pushed a block, // then also the future cannot be valid. @@ -549,6 +557,9 @@ class CompressedExternalIdTableSorter // output phase. int numBufferedOutputBlocks_ = 4; + // See the `moveResultOnMerge()` getter function for documentation. + bool moveResultOnMerge_ = true; + public: // Constructor. CompressedExternalIdTableSorter( @@ -579,6 +590,18 @@ class CompressedExternalIdTableSorter // within this class. using Base::push; + // If set to `false` then the sorted result can be extracted multiple times. + // If set to `true` then the result is moved out and unusable after the first + // merge. In that case an exception will be thrown at the start of the second + // merge. + // Note: This mechanism gives a performance advantage for very small inputs + // that can be completely sorted in RAM. In that case we can avoid a copy of + // the sorted result. + bool& moveResultOnMerge() { + AD_CONTRACT_CHECK(this->isFirstIteration_); + return moveResultOnMerge_; + } + // Transition from the input phase, where `push()` can be called, to the // output phase and return a generator that yields the sorted elements one by // one. Either this function or the following function must be called exactly @@ -594,6 +617,8 @@ class CompressedExternalIdTableSorter requires(N == NumStaticCols || N == 0) cppcoro::generator> getSortedBlocks( std::optional blocksize = std::nullopt) { + // If we move the result out, there must only be a single merge phase. + AD_CONTRACT_CHECK(this->isFirstIteration_ || !this->moveResultOnMerge_); mergeIsActive_.store(true); // Explanation for the second argument: One block is buffered by this // generator, one block is buffered inside the `sortedBlocks` generator, so @@ -604,6 +629,7 @@ class CompressedExternalIdTableSorter std::max(1, numBufferedOutputBlocks_ - 2))) { co_yield block; } + this->isFirstIteration_ = false; mergeIsActive_.store(false); } @@ -637,8 +663,15 @@ class CompressedExternalIdTableSorter auto& block = this->currentBlock_; const auto blocksizeOutput = blocksize.value_or(block.numRows()); if (block.numRows() <= blocksizeOutput) { - co_yield std::move(this->currentBlock_).template toStatic(); + if (this->moveResultOnMerge_) { + co_yield std::move(this->currentBlock_).template toStatic(); + } else { + auto blockAsStatic = IdTableStatic( + this->currentBlock_.clone().template toStatic()); + co_yield blockAsStatic; + } } else { + // TODO Use `std::views::chunk`. for (size_t i = 0; i < block.numRows(); i += blocksizeOutput) { size_t upper = std::min(i + blocksizeOutput, block.numRows()); auto curBlock = IdTableStatic( diff --git a/src/engine/idTable/IdTable.h b/src/engine/idTable/IdTable.h index 0184b14cef..7d611c050c 100644 --- a/src/engine/idTable/IdTable.h +++ b/src/engine/idTable/IdTable.h @@ -124,7 +124,8 @@ class IdTable { static constexpr bool columnsAreAllocatable = std::is_constructible_v; - using value_type = T; + // The type of a single entry in a row. + using single_value_type = T; // Because of the column-major layout, the `row_type` (a value type that // stores the values of a single row) and the `row_reference` (a type that // refers to a specific row of a specific `IdTable`) are different. They are @@ -135,6 +136,11 @@ class IdTable { using row_reference = RowReference; using const_row_reference = RowReference; + // This alias is required to make the `IdTable` class work with advanced GTest + // features, because GTest uses `Container::value_type` directly instead of + // using `std::iterator_traits`. + using value_type = row_type; + private: // Assign shorter aliases for some types that are important for the correct // handling of the proxy reference, but that are not visible to the outside. @@ -526,7 +532,7 @@ class IdTable { // numColumns()` implies that the function applies a permutation to the table. // For example `setColumnSubset({1, 2, 0})` rotates the columns of a table // with three columns left by one element. - void setColumnSubset(std::span subset) requires isDynamic { + void setColumnSubset(std::span subset) { // First check that the `subset` is indeed a subset of the column // indices. std::vector check{subset.begin(), subset.end()}; @@ -534,6 +540,10 @@ class IdTable { AD_CONTRACT_CHECK(std::unique(check.begin(), check.end()) == check.end()); AD_CONTRACT_CHECK(!subset.empty() && subset.back() < numColumns()); + // If the number of columns is statically fixed, then only a permutation of + // the columns and not a real subset is allowed. + AD_CONTRACT_CHECK(isDynamic || subset.size() == NumColumns); + Data newData; newData.reserve(subset.size()); std::ranges::for_each(subset, [this, &newData](ColumnIndex colIdx) { diff --git a/src/engine/idTable/IdTableRow.h b/src/engine/idTable/IdTableRow.h index d28d76c696..21294df659 100644 --- a/src/engine/idTable/IdTableRow.h +++ b/src/engine/idTable/IdTableRow.h @@ -85,6 +85,14 @@ class Row { friend void swap(Row& a, Row& b) { std::swap(a.data_, b.data_); } bool operator==(const Row& other) const = default; + + // Convert from a static `RowReference` to a `std::array` (makes a copy). + explicit operator std::array() const + requires(numStaticColumns != 0) { + std::array result; + std::ranges::copy(*this, result.begin()); + return result; + } }; // The following two classes store a reference to a row in the underlying @@ -120,7 +128,7 @@ class RowReferenceImpl { public: static constexpr bool isConst = isConstTag == ad_utility::IsConst::True; using TablePtr = std::conditional_t; - using T = typename Table::value_type; + using T = typename Table::single_value_type; static constexpr int numStaticColumns = Table::numStaticColumns; // Grant the `IdTable` class access to the internal details. diff --git a/src/global/SpecialIds.h b/src/global/SpecialIds.h index d96fcdebfd..e28aa1ef02 100644 --- a/src/global/SpecialIds.h +++ b/src/global/SpecialIds.h @@ -31,6 +31,16 @@ static const inline ad_utility::HashMap specialIds = []() { AD_CORRECTNESS_CHECK(uniqueIds.size() == result.size()); return result; }(); + +// Return the [lowerBound, upperBound) for the special Ids. +// This range can be used to filter them out in cases where we want to ignore +// triples that were added by QLever for internal reasons. +static constexpr std::pair getBoundsForSpecialIds() { + constexpr auto upperBound = Id::makeFromBool(false); + static_assert(static_cast(Datatype::Undefined) == 0); + static_assert(upperBound.getBits() == 1UL << Id::numDataBits); + return {Id::fromBits(1), upperBound}; +} } // namespace qlever #endif // QLEVER_SPECIALIDS_H diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 1e58a23291..d73f0a1a36 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -192,6 +192,9 @@ std::unique_ptr> IndexImpl::buildOspWithPatterns( auto isQleverInternalId) { auto&& [hasPatternPredicateSortedByPSO, secondSorter] = sortersFromPatternCreator; + // We need the patterns twice: once for the additional column, and once for + // the additional permutation. + hasPatternPredicateSortedByPSO->moveResultOnMerge() = false; // The column with index 1 always is `has-predicate` and is not needed here. // Note that the order of the columns during index building is alwasy `SPO`, // but the sorting might be different (PSO in this case). @@ -259,6 +262,19 @@ std::unique_ptr> IndexImpl::buildOspWithPatterns( makeSorterPtr("third"); createSecondPermutationPair(NumColumnsIndexBuilding + 2, isQleverInternalId, std::move(blockGenerator), *thirdSorter); + // Add the `ql:has-pattern` predicate to the sorter such that it will become + // part of the PSO and POS permutation. + LOG(INFO) << "Adding " << hasPatternPredicateSortedByPSO->size() + << " additional triples to the POS and PSO permutation for the " + "`ql:has-pattern` predicate ..." + << std::endl; + auto noPattern = Id::makeFromInt(NO_PATTERN); + static_assert(NumColumnsIndexBuilding == 3); + for (const auto& row : hasPatternPredicateSortedByPSO->sortedView()) { + // The repetition of the pattern index (`row[2]`) for the fourth column is + // useful for generic unit testing, but not needed otherwise. + thirdSorter->push(std::array{row[0], row[1], row[2], row[2], noPattern}); + } return thirdSorter; } // _____________________________________________________________________________ @@ -282,7 +298,10 @@ void IndexImpl::createFromFile(const string& filename) { writeConfiguration(); auto isQleverInternalId = [&indexBuilderData](const auto& id) { - return indexBuilderData.vocabularyMetaData_.isQleverInternalId(id); + // The special internal IDs like `ql:has-pattern` (see `SpecialIds.h`) + // have the datatype `UNDEFINED`. + return indexBuilderData.vocabularyMetaData_.isQleverInternalId(id) || + id.getDatatype() == Datatype::Undefined; }; // For the first permutation, perform a unique. @@ -754,6 +773,7 @@ void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) { totalVocabularySize_ = vocab_.size() + vocab_.getExternalVocab().size(); LOG(DEBUG) << "Number of words in internal and external vocabulary: " << totalVocabularySize_ << std::endl; + pso_.loadFromDisk(onDiskBase_); pos_.loadFromDisk(onDiskBase_); diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 4f57819157..8800d670a4 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -668,35 +669,43 @@ class IndexImpl { // index scan) and `GroupBy.cpp`. auto getIgnoredIdRanges(const Permutation::Enum permutation) const { std::vector> ignoredRanges; + ignoredRanges.emplace_back(qlever::getBoundsForSpecialIds()); auto literalRange = getVocab().prefix_range("\""); auto taggedPredicatesRange = getVocab().prefix_range("@"); auto internalEntitiesRange = getVocab().prefix_range(INTERNAL_ENTITIES_URI_PREFIX); - ignoredRanges.emplace_back( - Id::makeFromVocabIndex(internalEntitiesRange.first), - Id::makeFromVocabIndex(internalEntitiesRange.second)); + auto pushIgnoredRange = [&ignoredRanges](const auto& range) { + ignoredRanges.emplace_back(Id::makeFromVocabIndex(range.first), + Id::makeFromVocabIndex(range.second)); + }; + pushIgnoredRange(internalEntitiesRange); using enum Permutation::Enum; if (permutation == SPO || permutation == SOP) { - ignoredRanges.push_back({Id::makeFromVocabIndex(literalRange.first), - Id::makeFromVocabIndex(literalRange.second)}); + pushIgnoredRange(literalRange); } else if (permutation == PSO || permutation == POS) { - ignoredRanges.push_back( - {Id::makeFromVocabIndex(taggedPredicatesRange.first), - Id::makeFromVocabIndex(taggedPredicatesRange.second)}); + pushIgnoredRange(taggedPredicatesRange); } - auto isIllegalPredicateId = [=](Id predicateId) { + // A lambda that checks whether the `predicateId` is an internal ID like + // `ql:has-pattern` or `@en@rdfs:label`. + auto isInternalPredicateId = [internalEntitiesRange, + taggedPredicatesRange](Id predicateId) { + if (predicateId.getDatatype() == Datatype::Undefined) { + return true; + } + AD_CORRECTNESS_CHECK(predicateId.getDatatype() == Datatype::VocabIndex); auto idx = predicateId.getVocabIndex(); - return (idx >= internalEntitiesRange.first && - idx < internalEntitiesRange.second) || - (idx >= taggedPredicatesRange.first && - idx < taggedPredicatesRange.second); + auto isInRange = [idx](const auto& range) { + return range.first <= idx && idx < range.second; + }; + return (isInRange(internalEntitiesRange) || + isInRange(taggedPredicatesRange)); }; auto isTripleIgnored = [permutation, - isIllegalPredicateId](const auto& triple) { + isInternalPredicateId](const auto& triple) { // TODO: // A lot of code (especially for statistical queries in `GroupBy.cpp` and // the pattern trick) relies on this function being a noop for the `PSO` @@ -707,9 +716,9 @@ class IndexImpl { // be thoroughly reviewed. if (permutation == SPO || permutation == OPS) { // Predicates are always entities from the vocabulary. - return isIllegalPredicateId(triple[1]); + return isInternalPredicateId(triple[1]); } else if (permutation == SOP || permutation == OSP) { - return isIllegalPredicateId(triple[2]); + return isInternalPredicateId(triple[2]); } return false; }; diff --git a/src/parser/TripleComponent.h b/src/parser/TripleComponent.h index 03dd26253e..7d90ee1af3 100644 --- a/src/parser/TripleComponent.h +++ b/src/parser/TripleComponent.h @@ -14,6 +14,7 @@ #include "engine/LocalVocab.h" #include "global/Constants.h" #include "global/Id.h" +#include "global/SpecialIds.h" #include "parser/RdfEscaping.h" #include "parser/data/Variable.h" #include "util/Date.h" @@ -232,6 +233,8 @@ class TripleComponent { isString() ? getString() : getLiteral().rawContent(); if (vocabulary.getId(content, &idx)) { return Id::makeFromVocabIndex(idx); + } else if (qlever::specialIds.contains(content)) { + return qlever::specialIds.at(content); } else { return std::nullopt; } diff --git a/test/IdTableTest.cpp b/test/IdTableTest.cpp index 03e7cc1eea..b4fb384a72 100644 --- a/test/IdTableTest.cpp +++ b/test/IdTableTest.cpp @@ -974,6 +974,28 @@ TEST(IdTable, setColumnSubset) { ASSERT_ANY_THROW(t.setColumnSubset(std::vector{1, 2})); } +TEST(IdTableStatic, setColumnSubset) { + using IntTable = columnBasedIdTable::IdTable; + IntTable t; + t.push_back({0, 10, 20}); + t.push_back({1, 11, 21}); + t.push_back({2, 12, 22}); + t.setColumnSubset(std::array{ColumnIndex(2), ColumnIndex(0), ColumnIndex(1)}); + ASSERT_EQ(3, t.numColumns()); + ASSERT_EQ(3, t.numRows()); + ASSERT_THAT(t.getColumn(0), ::testing::ElementsAre(20, 21, 22)); + ASSERT_THAT(t.getColumn(1), ::testing::ElementsAre(0, 1, 2)); + ASSERT_THAT(t.getColumn(2), ::testing::ElementsAre(10, 11, 12)); + + // Duplicate columns are not allowed. + ASSERT_ANY_THROW(t.setColumnSubset(std::vector{0, 0, 1})); + // A column index is out of range. + ASSERT_ANY_THROW(t.setColumnSubset(std::vector{1, 2, 3})); + + // For static tables, we need a permutation, a real subset is not allowed. + ASSERT_ANY_THROW(t.setColumnSubset(std::vector{1, 2})); +} + TEST(IdTable, cornerCases) { using Dynamic = columnBasedIdTable::IdTable; { diff --git a/test/IndexTest.cpp b/test/IndexTest.cpp index cb876ad550..8c0159ff91 100644 --- a/test/IndexTest.cpp +++ b/test/IndexTest.cpp @@ -19,6 +19,8 @@ using namespace ad_utility::testing; +using ::testing::UnorderedElementsAre; + namespace { using ad_utility::source_location; auto lit = ad_utility::testing::tripleComponentLiteral; @@ -434,6 +436,7 @@ TEST(IndexTest, getIgnoredIdRanges) { // The range of all literals; auto literals = std::pair{firstLiteral, increment(lastLiteral)}; + auto specialIds = qlever::getBoundsForSpecialIds(); { auto [ranges, lambda] = index.getIgnoredIdRanges(Permutation::POS); ASSERT_FALSE(lambda(std::array{label, firstLiteral, x})); @@ -443,10 +446,9 @@ TEST(IndexTest, getIgnoredIdRanges) { // `ranges`. ASSERT_FALSE(lambda(std::array{enLabel, firstLiteral, x})); ASSERT_FALSE(lambda(std::array{x, x, x})); - ASSERT_EQ(2u, ranges.size()); - - ASSERT_EQ(ranges[0], internalEntities); - ASSERT_EQ(ranges[1], predicatesWithLangtag); + EXPECT_THAT(ranges, + UnorderedElementsAre(internalEntities, predicatesWithLangtag, + specialIds)); } { auto [ranges, lambda] = index.getIgnoredIdRanges(Permutation::PSO); @@ -457,46 +459,41 @@ TEST(IndexTest, getIgnoredIdRanges) { // `ranges`. ASSERT_FALSE(lambda(std::array{enLabel, x, firstLiteral})); ASSERT_FALSE(lambda(std::array{x, x, x})); - ASSERT_EQ(2u, ranges.size()); - - ASSERT_EQ(ranges[0], internalEntities); - ASSERT_EQ(ranges[1], predicatesWithLangtag); + EXPECT_THAT(ranges, + UnorderedElementsAre(internalEntities, predicatesWithLangtag, + specialIds)); } { auto [ranges, lambda] = index.getIgnoredIdRanges(Permutation::SOP); ASSERT_TRUE(lambda(std::array{x, firstLiteral, enLabel})); ASSERT_FALSE(lambda(std::array{x, firstLiteral, label})); ASSERT_FALSE(lambda(std::array{x, x, label})); - ASSERT_EQ(2u, ranges.size()); - - ASSERT_EQ(ranges[0], internalEntities); - ASSERT_EQ(ranges[1], literals); + EXPECT_THAT(ranges, + UnorderedElementsAre(internalEntities, literals, specialIds)); } { auto [ranges, lambda] = index.getIgnoredIdRanges(Permutation::SPO); ASSERT_TRUE(lambda(std::array{x, enLabel, firstLiteral})); ASSERT_FALSE(lambda(std::array{x, label, firstLiteral})); ASSERT_FALSE(lambda(std::array{x, label, x})); - ASSERT_EQ(2u, ranges.size()); - - ASSERT_EQ(ranges[0], internalEntities); - ASSERT_EQ(ranges[1], literals); + EXPECT_THAT(ranges, + UnorderedElementsAre(internalEntities, literals, specialIds)); } { auto [ranges, lambda] = index.getIgnoredIdRanges(Permutation::OSP); ASSERT_TRUE(lambda(std::array{firstLiteral, x, enLabel})); ASSERT_FALSE(lambda(std::array{firstLiteral, x, label})); ASSERT_FALSE(lambda(std::array{x, x, label})); - ASSERT_EQ(1u, ranges.size()); - ASSERT_EQ(ranges[0], internalEntities); + EXPECT_THAT(ranges, UnorderedElementsAre(internalEntities, specialIds)); } { auto [ranges, lambda] = index.getIgnoredIdRanges(Permutation::OPS); ASSERT_TRUE(lambda(std::array{firstLiteral, enLabel, x})); ASSERT_FALSE(lambda(std::array{firstLiteral, label, x})); ASSERT_FALSE(lambda(std::array{x, label, x})); - ASSERT_EQ(1u, ranges.size()); - ASSERT_EQ(ranges[0], internalEntities); + auto hasPattern = qlever::specialIds.at(HAS_PATTERN_PREDICATE); + ASSERT_TRUE(lambda(std::array{firstLiteral, hasPattern, x})); + EXPECT_THAT(ranges, UnorderedElementsAre(internalEntities, specialIds)); } } @@ -520,29 +517,32 @@ TEST(IndexTest, NumDistinctEntities) { auto predicates = index.numDistinctPredicates(); EXPECT_EQ(predicates.normal_, 2); - // One added predicate is `ql:langtag` and one added predicate for - // each combination of predicate+language that is actually used (e.g. - // `@en@label`). - EXPECT_EQ(predicates.internal_, 2); + // The added predicates are `ql:has-pattern`, `ql:langtag`, and one added + // predicate for each combination of predicate+language that is actually used + // (e.g. `@en@label`). + EXPECT_EQ(predicates.internal_, 3); EXPECT_EQ(predicates, index.numDistinctCol0(Permutation::PSO)); EXPECT_EQ(predicates, index.numDistinctCol0(Permutation::POS)); auto objects = index.numDistinctObjects(); EXPECT_EQ(objects.normal_, 7); - // One added object for each language that is used + // One added object for each language that is used. + // Note: The pattern indices from the `ql:has-pattern` predicate are currently + // not part of `objects.internal_`, but they are also not very important. EXPECT_EQ(objects.internal_, 1); EXPECT_EQ(objects, index.numDistinctCol0(Permutation::OSP)); EXPECT_EQ(objects, index.numDistinctCol0(Permutation::OPS)); auto numTriples = index.numTriples(); EXPECT_EQ(numTriples.normal_, 7); - // Two added triples for each triple that has an object with a language tag. - EXPECT_EQ(numTriples.internal_, 2); + // Two added triples for each triple that has an object with a language tag + // and one triple per subject for the pattern. + EXPECT_EQ(numTriples.internal_, 5); auto multiplicities = index.getMultiplicities(Permutation::SPO); - EXPECT_FLOAT_EQ(multiplicities[0], 9.0 / 4.0); - EXPECT_FLOAT_EQ(multiplicities[1], 9.0 / 4.0); - EXPECT_FLOAT_EQ(multiplicities[2], 9.0 / 8.0); + EXPECT_FLOAT_EQ(multiplicities[0], 12.0 / 4.0); + EXPECT_FLOAT_EQ(multiplicities[1], 12.0 / 5.0); + EXPECT_FLOAT_EQ(multiplicities[2], 12.0 / 8.0); multiplicities = index.getMultiplicities("", Permutation::SPO); EXPECT_FLOAT_EQ(multiplicities[0], 2.5); diff --git a/test/TripleComponentTest.cpp b/test/TripleComponentTest.cpp index 44f38be43e..e7792280aa 100644 --- a/test/TripleComponentTest.cpp +++ b/test/TripleComponentTest.cpp @@ -157,6 +157,10 @@ TEST(TripleComponent, toValueId) { tc = 42; ASSERT_EQ(tc.toValueIdIfNotString().value(), I(42)); + + tc = HAS_PATTERN_PREDICATE; + ASSERT_EQ(tc.toValueId(vocab).value(), + qlever::specialIds.at(HAS_PATTERN_PREDICATE)); } TEST(TripleComponent, settingVariablesAsStringsIsIllegal) { diff --git a/test/engine/idTable/CompressedExternalIdTableTest.cpp b/test/engine/idTable/CompressedExternalIdTableTest.cpp index 9847473b2c..5d679ef814 100644 --- a/test/engine/idTable/CompressedExternalIdTableTest.cpp +++ b/test/engine/idTable/CompressedExternalIdTableTest.cpp @@ -15,21 +15,6 @@ using ad_utility::source_location; using namespace ad_utility::memory_literals; namespace { -// Implementation of a class that inherits from `IdTable` but is copyable -// (convenient for testing). -template -using TableImpl = std::conditional_t>; -template -class CopyableIdTable : public TableImpl { - public: - using Base = TableImpl; - using Base::Base; - CopyableIdTable(const CopyableIdTable& rhs) : Base{rhs.clone()} {} - CopyableIdTable& operator=(const CopyableIdTable& rhs) { - static_cast(*this) = rhs.clone(); - return *this; - } -}; // From a `generator` that yields `IdTable`s, create a single `IdTable` that is // the concatenation of all the yielded tables. @@ -93,9 +78,10 @@ TEST(CompressedExternalIdTable, compressedExternalIdTableWriter) { } template -void testExternalSorter(size_t numDynamicColumns, size_t numRows, - ad_utility::MemorySize memoryToUse, - source_location l = source_location::current()) { +void testExternalSorterImpl(size_t numDynamicColumns, size_t numRows, + ad_utility::MemorySize memoryToUse, + bool mergeMultipleTimes, + source_location l = source_location::current()) { auto tr = generateLocationTrace(l); std::string filename = "idTableCompressedSorter.testExternalSorter.dat"; using namespace ad_utility::memory_literals; @@ -116,16 +102,38 @@ void testExternalSorter(size_t numDynamicColumns, size_t numRows, std::ranges::sort(randomTable, SortByOSP{}); - auto generator = writer.sortedView(); + if (mergeMultipleTimes) { + writer.moveResultOnMerge() = false; + } - using namespace ::testing; - auto result = - idTableFromRowGenerator(generator, numDynamicColumns); - ASSERT_THAT(result, Eq(randomTable)); + for (size_t k = 0; k < 5; ++k) { + auto generator = writer.sortedView(); + using namespace ::testing; + if (mergeMultipleTimes || k == 0) { + auto result = idTableFromRowGenerator( + generator, numDynamicColumns); + ASSERT_THAT(result, Eq(randomTable)) << "k = " << k; + } else { + EXPECT_ANY_THROW((idTableFromRowGenerator( + generator, numDynamicColumns))); + } + // We cannot access or change this value after the first merge. + EXPECT_ANY_THROW(writer.moveResultOnMerge()); + } writer.clear(); } } +template +void testExternalSorter(size_t numDynamicColumns, size_t numRows, + ad_utility::MemorySize memoryToUse, + source_location l = source_location::current()) { + testExternalSorterImpl(numDynamicColumns, numRows, + memoryToUse, true, l); + testExternalSorterImpl(numDynamicColumns, numRows, + memoryToUse, false, l); +} + TEST(CompressedExternalIdTable, sorterRandomInputs) { using namespace ad_utility::memory_literals; // Test for dynamic (<0>) and static(<3>) tables. diff --git a/test/util/IdTableHelpers.h b/test/util/IdTableHelpers.h index 8be0616f73..1f9eca2436 100644 --- a/test/util/IdTableHelpers.h +++ b/test/util/IdTableHelpers.h @@ -35,6 +35,22 @@ struct IdTableAndJoinColumn { size_t joinColumn; }; +// Implementation of a class that inherits from `IdTable` but is copyable +// (convenient for testing). +template +using TableImpl = std::conditional_t>; +template +class CopyableIdTable : public TableImpl { + public: + using Base = TableImpl; + using Base::Base; + CopyableIdTable(const CopyableIdTable& rhs) : Base{rhs.clone()} {} + CopyableIdTable& operator=(const CopyableIdTable& rhs) { + static_cast(*this) = rhs.clone(); + return *this; + } +}; + // For easier reading. We repeat that type combination so often, that this // will make things a lot easier in terms of reading and writing. using IntOrId = std::variant; diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index b0d888746a..44c69e1864 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -5,6 +5,7 @@ #include "../IndexTestHelpers.h" #include "./GTestHelpers.h" +#include "global/SpecialIds.h" #include "index/IndexImpl.h" namespace ad_utility::testing { @@ -70,6 +71,19 @@ void checkConsistencyBetweenOldAndNewPatterns(const Index& index) { auto scanResult = index.scan(col0Id, std::nullopt, permutation, std::array{ColumnIndex{2}, ColumnIndex{3}}, cancellationDummy); + auto hasPatternId = qlever::specialIds.at(HAS_PATTERN_PREDICATE); + auto scanResultHasPattern = + index.scan(hasPatternId, col0Id, Permutation::Enum::PSO, {}, + cancellationDummy); + // Each ID has at most one pattern, it can have none if it doesn't + // appear as a subject in the knowledge graph. + AD_CORRECTNESS_CHECK(scanResultHasPattern.numRows() <= 1); + if (scanResultHasPattern.numRows() == 0) { + checkSingleElement(index, NO_PATTERN, col0Id); + } else { + checkSingleElement(index, scanResultHasPattern(0, 0).getInt(), + col0Id); + } ASSERT_EQ(scanResult.numColumns(), 4u); for (const auto& row : scanResult) { auto patternIdx = row[2].getInt(); @@ -153,6 +167,7 @@ Index makeTestIndex(const std::string& indexBasename, EXPECT_EQ(index.loadAllPermutations(), loadAllPermutations); EXPECT_EQ(index.usePatterns(), usePatterns); } + Index index{ad_utility::makeUnlimitedAllocator()}; index.usePatterns() = usePatterns; index.loadAllPermutations() = loadAllPermutations;