From 84a4bdf9ee790390ffaea5d8eeebfbb52f61f8c7 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Fri, 9 Jun 2023 16:30:57 +0200 Subject: [PATCH] Code for locating triples in an existing index This is the first part of a series of PRs split of from the large proof-of-concept PR https://github.com/ad-freiburg/qlever/pull/916, which realizes SPARQL 1.1 Update --- src/global/IdTriple.h | 18 ++ src/index/CMakeLists.txt | 1 + src/index/CompressedRelation.h | 1 + src/index/IndexMetaData.h | 20 +- src/index/LocatedTriples.cpp | 349 +++++++++++++++++++++++++++++++++ src/index/LocatedTriples.h | 196 ++++++++++++++++++ src/index/MetaDataHandler.h | 51 +++-- test/CMakeLists.txt | 2 + test/LocatedTriplesTest.cpp | 173 ++++++++++++++++ 9 files changed, 789 insertions(+), 22 deletions(-) create mode 100644 src/global/IdTriple.h create mode 100644 src/index/LocatedTriples.cpp create mode 100644 src/index/LocatedTriples.h create mode 100644 test/LocatedTriplesTest.cpp diff --git a/src/global/IdTriple.h b/src/global/IdTriple.h new file mode 100644 index 0000000000..0353b8c747 --- /dev/null +++ b/src/global/IdTriple.h @@ -0,0 +1,18 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#pragma once + +#include + +#include "global/Id.h" + +// Should we have an own class for this? We need this at several places. +using IdTriple = std::array; + +// Hash value for such triple. 
+template +H AbslHashValue(H h, const IdTriple& triple) { + return H::combine(std::move(h), triple[0], triple[1], triple[2]); +} diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index 4bbf53f647..fd65af2bd4 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -8,6 +8,7 @@ add_library(index VocabularyOnDisk.h VocabularyOnDisk.cpp IndexMetaData.h IndexMetaDataImpl.h MetaDataHandler.h + LocatedTriples.h LocatedTriples.cpp StxxlSortFunctors.h TextMetaData.cpp TextMetaData.h DocsDB.cpp DocsDB.h diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 3c6c5df80a..63d39a28ba 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -305,6 +305,7 @@ class CompressedRelationReader { static void decompressColumn(const std::vector& compressedColumn, size_t numRowsToRead, Iterator iterator); + public: // Read the block that is identified by the `blockMetaData` from the `file`, // decompress and return it. // If `columnIndices` is `nullopt`, then all columns of the block are read, diff --git a/src/index/IndexMetaData.h b/src/index/IndexMetaData.h index 4e3ef4b38f..3039c0ba28 100644 --- a/src/index/IndexMetaData.h +++ b/src/index/IndexMetaData.h @@ -1,6 +1,7 @@ // Copyright 2015, University of Freiburg, // Chair of Algorithms and Data Structures. 
// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) + #pragma once #include @@ -13,14 +14,14 @@ #include #include -#include "../global/Id.h" -#include "../util/File.h" -#include "../util/HashMap.h" -#include "../util/MmapVector.h" -#include "../util/ReadableNumberFact.h" -#include "../util/Serializer/Serializer.h" -#include "./MetaDataHandler.h" -#include "CompressedRelation.h" +#include "global/Id.h" +#include "index/CompressedRelation.h" +#include "index/MetaDataHandler.h" +#include "util/File.h" +#include "util/HashMap.h" +#include "util/MmapVector.h" +#include "util/ReadableNumberFact.h" +#include "util/Serializer/Serializer.h" using std::array; using std::pair; @@ -86,7 +87,10 @@ class IndexMetaData { // name and the variable name are terrible. // For each relation, its meta data. + public: MapType _data; + + private: // For each compressed block, its meta data. BlocksType _blockData; diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp new file mode 100644 index 0000000000..acd6988675 --- /dev/null +++ b/src/index/LocatedTriples.cpp @@ -0,0 +1,349 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#include "index/LocatedTriples.h" + +#include + +#include "index/CompressedRelation.h" +#include "index/IndexMetaData.h" +#include "index/Permutations.h" + +// ____________________________________________________________________________ +LocatedTriple LocatedTriple::locateTripleInPermutation( + Id id1, Id id2, Id id3, const Permutation& permutation) { + // Get the internal data structures from the permutation. + auto& file = permutation._file; + const auto& meta = permutation._meta; + const auto& reader = permutation._reader; + + // Find the index of the first block where the last triple is not smaller. + // + // NOTE: Since `_col2LastId` has been added to `CompressedBlockMetadata`, this + // can be computed without having to decompress any blocks. 
+ const vector& blocks = meta.blockData(); + auto matchingBlock = std::lower_bound( + blocks.begin(), blocks.end(), std::array{id1, id2, id3}, + [&](const CompressedBlockMetadata& block, const auto& triple) -> bool { + if (block.col0LastId_ < triple[0]) { + return true; + } else if (block.col0LastId_ == triple[0]) { + if (block.col1LastId_ < triple[1]) { + return true; + } else if (block.col1LastId_ == triple[1]) { + return block.col2LastId_ < triple[2]; + } + } + return false; + }); + size_t blockIndex = matchingBlock - blocks.begin(); + + // Preliminary `FindTripleResult` object with the correct `blockIndex` and + // `Id`s, and a special `rowIndexInBlock` (see below) and `existsInIndex` set + // to `false`. + LocatedTriple locatedTriple{blockIndex, NO_ROW_INDEX, id1, id2, id3, false}; + + // If all `Id`s from all blocks are smaller, we return the index of the last + // block plus one (typical "end" semantics) and the special row index + // `NO_ROW_INDEX` (see how this is considered in `mergeTriples`). + if (matchingBlock == blocks.end()) { + AD_CORRECTNESS_CHECK(blockIndex == blocks.size()); + return locatedTriple; + } + + // Read and decompress the block. + DecompressedBlock blockTuples = + reader.readAndDecompressBlock(*matchingBlock, file, std::nullopt); + + // Find the smallest relation `Id` that is not smaller than `id1` and get its + // metadata and the position of the first and last triple with that `Id` in + // the block. + // + // IMPORTANT: If relation `id1` exists in the index, but our triple is larger + // than all triples of that relation in the index and the last triple of that + // relation ends a block, then our block search above (correctly) landed us at + // the next block. We can detect this by checking whether the first relation + // `Id` of the block is larger than `id1` and then we should get the metadata + // for the `Id` and not for `id1` (which would pertain to a previous block). 
+ // + // TODO: There is still a bug in `MetaDataWrapperHashMap::lower_bound`, + // which is relevant in the rare case where a triple is inserted with an + // `Id` for predicate that is not a new `Id`, but has not been used for a + // predicate in the original index. + // + // NOTE: Since we have already handled the case, where all `Id`s in the + // permutation are smaller, above, such a relation should exist. + Id searchId = + matchingBlock->col0FirstId_ > id1 ? matchingBlock->col0FirstId_ : id1; + const auto& it = meta._data.lower_bound(searchId); + AD_CORRECTNESS_CHECK(it != meta._data.end()); + Id id = it.getId(); + const auto& relationMetadata = meta.getMetaData(id); + size_t offsetBegin = relationMetadata.offsetInBlock_; + size_t offsetEnd = offsetBegin + relationMetadata.numRows_; + // Note: If the relation spans multiple blocks, we know that the block we + // found above contains only triples from that relation. + if (offsetBegin == std::numeric_limits::max()) { + offsetBegin = 0; + offsetEnd = blockTuples.size(); + } + AD_CORRECTNESS_CHECK(offsetBegin <= blockTuples.size()); + AD_CORRECTNESS_CHECK(offsetEnd <= blockTuples.size()); + + // If we have found `id1`, we can do a binary search in the portion of the + // block that pertains to it (note the special case mentioned above, where + // we are already at the beginning of the next block). + // + // Otherwise, `id` is the next larger `Id` and the position of the first + // triple of that relation is exactly the position we are looking for. + if (id == id1) { + locatedTriple.rowIndexInBlock = + std::lower_bound(blockTuples.begin() + offsetBegin, + blockTuples.begin() + offsetEnd, + std::array{id2, id3}, + [](const auto& a, const auto& b) { + return a[0] < b[0] || (a[0] == b[0] && a[1] < b[1]); + }) - + blockTuples.begin(); + // Check if the triple at the found position is equal to `id1 id2 id3`. + // Note that our default for `existsInIndex` was set to `false` above. 
+ const size_t& i = locatedTriple.rowIndexInBlock; + AD_CORRECTNESS_CHECK(i < blockTuples.size()); + if (i < offsetEnd && blockTuples(i, 0) == id2 && blockTuples(i, 1) == id3) { + locatedTriple.existsInIndex = true; + } + } else { + AD_CORRECTNESS_CHECK(id1 < id); + locatedTriple.rowIndexInBlock = offsetBegin; + } + + // Return the result. + return locatedTriple; +} + +// ____________________________________________________________________________ +template +std::pair LocatedTriplesPerBlock::numTriplesImpl( + size_t blockIndex, Id id1, Id id2) const { + // If no located triples for `blockIndex` exist, there is no entry in `map_`. + if (!map_.contains(blockIndex)) { + return {0, 0}; + } + + // Otherwise iterate over all located triples and count how many of them exist + // in the index ("to be deleted") and how many are new ("to be inserted"). + size_t countExists = 0; + size_t countNew = 0; + for (const LocatedTriple& locatedTriple : map_.at(blockIndex)) { + // Helper lambda for increasing the right counter. + auto increaseCountIf = [&](bool increase) { + if (increase) { + if (locatedTriple.existsInIndex) { + ++countExists; + } else { + ++countNew; + } + } + }; + // Increase depending on the mode. 
+ if constexpr (matchMode == MatchMode::MatchAll) { + increaseCountIf(true); + } else if constexpr (matchMode == MatchMode::MatchId1) { + increaseCountIf(locatedTriple.id1 == id1); + } else if constexpr (matchMode == MatchMode::MatchId1AndId2) { + increaseCountIf(locatedTriple.id1 == id1 && locatedTriple.id2 == id2); + } + } + return {countNew, countExists}; +} + +// ____________________________________________________________________________ +std::pair LocatedTriplesPerBlock::numTriples( + size_t blockIndex) const { + return numTriplesImpl(blockIndex); +} + +// ____________________________________________________________________________ +std::pair LocatedTriplesPerBlock::numTriples(size_t blockIndex, + Id id1) const { + return numTriplesImpl(blockIndex, id1); +} + +// ____________________________________________________________________________ +std::pair LocatedTriplesPerBlock::numTriples(size_t blockIndex, + Id id1, + Id id2) const { + return numTriplesImpl(blockIndex, id1, id2); +} + +// ____________________________________________________________________________ +template +size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + std::optional block, + IdTable& result, + size_t offsetInResult, Id id1, + Id id2, size_t rowIndexInBlockBegin, + size_t rowIndexInBlockEnd) const { + // This method should only be called if there are located triples in the + // specified block. + AD_CONTRACT_CHECK(map_.contains(blockIndex)); + + // The special case `block == std::nullopt` (write only located triples to + // `result`) is only allowed, when `id1` or `id1` and `id2` are specified. + AD_CONTRACT_CHECK(block.has_value() || matchMode != MatchMode::MatchAll); + + // If `rowIndexInBlockEnd` has the default value (see `LocatedTriples.h`), the + // intended semantics is that we read the whole block (note that we can't have + // a default value that depends on the values of previous arguments). 
+ if (rowIndexInBlockEnd == LocatedTriple::NO_ROW_INDEX && block.has_value()) { + rowIndexInBlockEnd = block.value().size(); + } + + // Check that `rowIndexInBlockBegin` and `rowIndexInBlockEnd` define a valid + // and non-emtpy range and that it is a subrange of `block` (unless the latter + // is `std::nullopt`). + if (block.has_value()) { + AD_CONTRACT_CHECK(rowIndexInBlockBegin < block.value().size()); + AD_CONTRACT_CHECK(rowIndexInBlockEnd <= block.value().size()); + } + AD_CONTRACT_CHECK(rowIndexInBlockBegin < rowIndexInBlockEnd); + + // If we restrict `id1` and `id2`, the index block and the result must have + // one column (for the `id3`). Otherwise, they must have two columns (for the + // `id2` and the `id3`). + if constexpr (matchMode == MatchMode::MatchId1AndId2) { + AD_CONTRACT_CHECK(!block.has_value() || block.value().numColumns() == 1); + AD_CONTRACT_CHECK(result.numColumns() == 1); + } else { + AD_CONTRACT_CHECK(!block.has_value() || block.value().numColumns() == 2); + AD_CONTRACT_CHECK(result.numColumns() == 2); + } + + auto resultEntry = result.begin() + offsetInResult; + const auto& locatedTriples = map_.at(blockIndex); + auto locatedTriple = locatedTriples.begin(); + + // Helper lambda that checks whether the given located triple should be + // considered, given the `matchMode`. + auto locatedTripleMatches = [&]() { + if constexpr (matchMode == MatchMode::MatchAll) { + return true; + } else if constexpr (matchMode == MatchMode::MatchId1) { + return locatedTriple->id1 == id1; + } else if constexpr (matchMode == MatchMode::MatchId1AndId2) { + return locatedTriple->id1 == id1 && locatedTriple->id2 == id2; + } + }; + + // Advance to the first located triple in the specified range. + while (locatedTriple != locatedTriples.end() && + locatedTriple->rowIndexInBlock < rowIndexInBlockBegin) { + ++locatedTriple; + } + + // Iterate over all located triples in the specified range. 
In the special + // case `block == std::nullopt` (only write located triples to `result`), all + // relevant located triples have `rowIndexInBlock == NO_ROW_INDEX` (here we + // need that `NO_ROW_INDEX` is the maximal `size_t` value minus one). + if (!block.has_value()) { + rowIndexInBlockBegin = LocatedTriple::NO_ROW_INDEX; + rowIndexInBlockEnd = rowIndexInBlockBegin + 1; + AD_CORRECTNESS_CHECK(rowIndexInBlockBegin < rowIndexInBlockEnd); + } + for (size_t rowIndex = rowIndexInBlockBegin; rowIndex < rowIndexInBlockEnd; + ++rowIndex) { + // Append triples that are marked for insertion at this `rowIndex` to the + // result. + while (locatedTriple != locatedTriples.end() && + locatedTriple->rowIndexInBlock == rowIndex && + locatedTriple->existsInIndex == false) { + if (locatedTripleMatches()) { + if constexpr (matchMode == MatchMode::MatchId1AndId2) { + (*resultEntry)[0] = locatedTriple->id3; + } else { + (*resultEntry)[0] = locatedTriple->id2; + (*resultEntry)[1] = locatedTriple->id3; + } + ++resultEntry; + } + ++locatedTriple; + } + + // Append the triple at this position to the result if and only if it is not + // marked for deletion and matches (also skip it if it does not match). + bool deleteThisEntry = false; + if (locatedTriple != locatedTriples.end() && + locatedTriple->rowIndexInBlock == rowIndex && + locatedTriple->existsInIndex == true) { + deleteThisEntry = locatedTripleMatches(); + ++locatedTriple; + } + if (block.has_value() && !deleteThisEntry) { + *resultEntry++ = block.value()[rowIndex]; + } + }; + + // Return the number of rows written to `result`. 
+ return resultEntry - (result.begin() + offsetInResult); +} + +// ____________________________________________________________________________ +size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + std::optional block, + IdTable& result, + size_t offsetInResult) const { + return mergeTriples(blockIndex, std::move(block), result, + offsetInResult); +} + +// ____________________________________________________________________________ +size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + std::optional block, + IdTable& result, + size_t offsetInResult, Id id1, + size_t rowIndexInBlockBegin) const { + return mergeTriples( + blockIndex, std::move(block), result, offsetInResult, id1, + Id::makeUndefined(), rowIndexInBlockBegin); +} + +// ____________________________________________________________________________ +size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + std::optional block, + IdTable& result, + size_t offsetInResult, Id id1, + Id id2, size_t rowIndexInBlockBegin, + size_t rowIndexInBlockEnd) const { + return mergeTriples( + blockIndex, std::move(block), result, offsetInResult, id1, id2, + rowIndexInBlockBegin, rowIndexInBlockEnd); +} + +// ____________________________________________________________________________ +std::ostream& operator<<(std::ostream& os, const LocatedTriple& lt) { + os << "LT(" << lt.blockIndex << " " + << (lt.rowIndexInBlock == LocatedTriple::NO_ROW_INDEX + ? 
"NO_ROW_INDEX" + : std::to_string(lt.rowIndexInBlock)) + << " " << lt.id1 << " " << lt.id2 << " " << lt.id3 << " " + << lt.existsInIndex << ")"; + return os; +} + +// ____________________________________________________________________________ +std::ostream& operator<<(std::ostream& os, const LocatedTriples& lts) { + os << "{"; + std::copy(lts.begin(), lts.end(), + std::ostream_iterator<LocatedTriple>(os, " ")); // Elements go to `os`. + os << "}"; + return os; +} + +// ____________________________________________________________________________ +std::ostream& operator<<(std::ostream& os, const LocatedTriplesPerBlock& ltpb) { + for (const auto& [blockIndex, lts] : ltpb.map_) { // Bind, don't copy sets. + os << "Block #" << blockIndex << ": " << lts << std::endl; + } + return os; +} diff --git a/src/index/LocatedTriples.h b/src/index/LocatedTriples.h new file mode 100644 index 0000000000..bb967bfe95 --- /dev/null +++ b/src/index/LocatedTriples.h @@ -0,0 +1,196 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#pragma once + +#include "engine/idTable/IdTable.h" +#include "global/IdTriple.h" +#include "util/HashMap.h" + +class Permutation; + +// A triple and its location in a particular permutation. +// +// If a triple is not contained in the permutation, the location is the location +// of the next larger triple (which may be in the next block or beyond the last +// block). For a detailed definition of all border cases, see the definition at +// the end of this file. +// +// NOTE: Technically, `blockIndex` and the `existsInIndex` are redundant in this +// record because they can be derived when the class is used. However, they are +// useful for testing, and for a small number of delta triples (think millions), +// space efficiency is not a significant issue for this class. +struct LocatedTriple { + // The index of the block and the location within that block, according to the + // definition above. 
+ size_t blockIndex; + size_t rowIndexInBlock; + // The `Id`s of the triple in the order of the permutation. For example, + // for an object pertaining to the SPO permutation: `id1` is the subject, + // `id2` is the predicate, and `id3` is the object. + Id id1; + Id id2; + Id id3; + // Flag that is true if and only if the triple exists in the permutation. It + // is then equal to the triple at the position given by `blockIndex` and + // `rowIndexInBlock`. + bool existsInIndex; + + // Locate the given triple in the given permutation. + static LocatedTriple locateTripleInPermutation( + Id id1, Id id2, Id id3, const Permutation& permutation); + + // Special row index for triples that belong to the previous block (see the + // definition for the location of a triple at the end of this file). + // + // NOTE: It is important that `NO_ROW_INDEX + 1 > NO_ROW_INDEX`, hence it is + // defined as `max() - 1` and not as the seemingly more natural `max()`. + static const size_t NO_ROW_INDEX = std::numeric_limits::max() - 1; +}; + +// A sorted set of located triples. In `LocatedTriplesPerBlock` below, we use +// this to store all located triples with the same `blockIndex`. +// +// NOTE: We could also overload `std::less` here, but the explicit specification +// of the order makes it clearer. +struct LocatedTripleCompare { + bool operator()(const LocatedTriple& x, const LocatedTriple& y) const { + return IdTriple{x.id1, x.id2, x.id3} < IdTriple{y.id1, y.id2, y.id3}; + } +}; +using LocatedTriples = std::set; + +// Sorted sets of located triples, grouped by block. We use this to store all +// located triples for a permutation. +class LocatedTriplesPerBlock { + private: + // The total number of `LocatedTriple` objects stored (for all blocks). + size_t numTriples_ = 0; + + public: + // For each block with a non-empty set of located triples, the located triples + // in that block. 
+ // + // NOTE: This is currently not private because we want access to + // `map_.size()`, `map_.clear()`, `map_.contains(...)`, and `map_.at(...)`. + // We could also make `LocatedTriplesPerBlock` a subclass of `HashMap`, but not sure whether that is good style. + ad_utility::HashMap map_; + + public: + // Get the number of located triples for the given block that match `id1` (if + // provided) and `id2` (if provided). The return value is a pair of numbers: + // first, the number of new triples ("to be inserted") and second, the + // number of existing triples ("to be deleted"). + std::pair numTriples(size_t blockIndex) const; + std::pair numTriples(size_t blockIndex, Id id1) const; + std::pair numTriples(size_t blockIndex, Id id1, Id id2) const; + + // Merge located triples for `blockIndex` with the given index `block` and + // write to `result`, starting from position `offsetInResult`. Consider only + // located triples in the range specified by `rowIndexInBlockBegin` and + // `rowIndexInBlockEnd`. Consider only triples that match `id1` (if provided) + // and `id2` (if provided). Return the number of rows written to `result`. + // + // PRECONDITIONS: + // + // 1. The set of located triples for `blockIndex` must be non-empty. + // Otherwise, there is no need for merging and this method shouldn't be + // called for efficiency reasons. + // + // 2. It is the responsibility of the caller that there is enough space for the + // result of the merge in `result` starting from `offsetInResult`. + // + // 3. If `block == std::nullopt`, we are adding to `result` the located + // triples for block `blockIndex` where the `rowIndexInBlock` is + // `NO_ROW_INDEX`. These actually belong to the previous block, but were + // larger than all triples there. This requires that `id1` or both `id1` and + // `id2` are specified. 
+ // + size_t mergeTriples(size_t blockIndex, std::optional block, + IdTable& result, size_t offsetInResult) const; + size_t mergeTriples(size_t blockIndex, std::optional block, + IdTable& result, size_t offsetInResult, Id id1, + size_t rowIndexInBlockBegin = 0) const; + size_t mergeTriples( + size_t blockIndex, std::optional block, IdTable& result, + size_t offsetInResult, Id id1, Id id2, size_t rowIndexInBlockBegin = 0, + size_t rowIndexInBlockEnd = LocatedTriple::NO_ROW_INDEX) const; + + // Add the given `locatedTriple` to the given `LocatedTriplesPerBlock`. + // Return a handle to where it was added (`LocatedTriples` is a sorted set, + // see above). We need this handle so that we can easily remove the + // `locatedTriple` again from the set in case we need to. + // + // The `locatedTriple` must not already exist in `LocatedTriplesPerBlock`. + LocatedTriples::iterator add(const LocatedTriple& locatedTriple) { + LocatedTriples& locatedTriples = map_[locatedTriple.blockIndex]; + auto [handle, wasInserted] = locatedTriples.emplace(locatedTriple); + AD_CORRECTNESS_CHECK(wasInserted == true); + AD_CORRECTNESS_CHECK(handle != locatedTriples.end()); + ++numTriples_; + return handle; + }; + + // Get the total number of `LocatedTriple` objects (for all blocks). + size_t numTriples() const { return numTriples_; } + + // Get the number of blocks with a non-empty set of located triples. + size_t numBlocks() const { return map_.size(); } + + // Remove all located triples. + void clear() { + map_.clear(); + numTriples_ = 0; + } + + private: + // Match modes for `numTriplesImpl` and the templated `mergeTriples`. + enum struct MatchMode { MatchAll, MatchId1, MatchId1AndId2 }; + + // The implementation behind the public `numTriples` overloads above. + template + std::pair numTriplesImpl(size_t blockIndex, + Id id1 = Id::makeUndefined(), + Id id2 = Id::makeUndefined()) const; + + // The implementation behind the public `mergeTriples` overloads above. 
+ // The only reason that the arguments `id1` and `id2` come at the end here is + // so that we can give them default values. + template + size_t mergeTriples( + size_t blockIndex, std::optional block, IdTable& result, + size_t offsetInResult, Id id1 = Id::makeUndefined(), + Id id2 = Id::makeUndefined(), size_t rowIndexInBlockBegin = 0, + size_t rowIndexInBlockEnd = LocatedTriple::NO_ROW_INDEX) const; +}; + +// Human-readable representation of `LocatedTriple`, `LocatedTriples`, and +// `LocatedTriplesPerBlock`, which are very useful for debugging. +std::ostream& operator<<(std::ostream& os, const LocatedTriple& lt); +std::ostream& operator<<(std::ostream& os, const LocatedTriples& lts); +std::ostream& operator<<(std::ostream& os, const LocatedTriplesPerBlock& ltpb); + +// DEFINITION OF THE POSITION OF A LOCATED TRIPLE IN A PERMUTATION +// +// 1. The position is defined by the index of a block in the permutation and the +// index of a row within that block. +// +// 2. If the triple is contained in the permutation, it is contained exactly +// once and so there is a well defined block and position in that block. +// +// 3. If there is a block, where the first triple is smaller and the last triple +// is larger, then that is the block and the position in that block is that of +// the first triple that is (not smaller and hence) larger. +// +// 4. If the triple falls "between two blocks" (the last triple of the previous +// block is smaller and the first triple of the next block is larger), then the +// position is the first position in that next block. +// +// 5. As a special case of 4, if the triple is smaller than all triples in the +// permutation, the position is the first position of the first block. +// +// 6. If the triple is larger than all triples in the permutation, the block +// index is one after the largest block index and the position within that +// non-existing block is arbitrary. 
diff --git a/src/index/MetaDataHandler.h b/src/index/MetaDataHandler.h index da84f1158a..e24e33fe5c 100644 --- a/src/index/MetaDataHandler.h +++ b/src/index/MetaDataHandler.h @@ -1,29 +1,39 @@ -// Copyright 2018, University of Freiburg, +// Copyright 2018 - 2023, University of Freiburg // Chair of Algorithms and Data Structures -// Author: Johannes Kalmbach (johannes.kalmbach@gmail.com) -// +// Authors: Johannes Kalmbach +// Hannah Bast + #pragma once #include #include -#include "../global/Id.h" -#include "../util/Exception.h" -#include "../util/HashMap.h" -#include "../util/Iterators.h" -#include "../util/Log.h" -#include "../util/Serializer/Serializer.h" -#include "./CompressedRelation.h" - -// _____________________________________________________________________ +#include "global/Id.h" +#include "index/CompressedRelation.h" +#include "util/Exception.h" +#include "util/HashMap.h" +#include "util/Iterators.h" +#include "util/Log.h" +#include "util/Serializer/Serializer.h" + +// Class for access to relation metadata stored in a vector. Specifically, our +// index uses this with `M = MmapVector>`; see +// `index/IndexMetaData.h` template class MetaDataWrapperDense { + private: + // A vector of metadata objects. + M _vec; + public: + // An iterator with an additional method `getId()` that gives the relation ID + // of the current metadata object. template struct AddGetIdIterator : BaseIterator { using BaseIterator::BaseIterator; AddGetIdIterator(BaseIterator base) : BaseIterator{base} {} [[nodiscard]] Id getId() const { return getIdFromElement(*(*this)); } + [[nodiscard]] const auto& getMetaData() const { return *(*this); } static Id getIdFromElement(const typename BaseIterator::value_type& v) { return v.col0Id_; } @@ -39,6 +49,7 @@ class MetaDataWrapperDense { // The underlying array is sorted, so all iterators are ordered iterators using ConstOrderedIterator = ConstIterator; + // The type of the stored metadata objects. 
using value_type = typename M::value_type; // _________________________________________________________ @@ -109,12 +120,19 @@ class MetaDataWrapperDense { // ___________________________________________________________ std::string getFilename() const { return _vec.getFilename(); } - private: + // The following used to be private (because they were only used as + // subroutines in the above), but we now need them in + // `DeltaTriples::findTripleResult`. ConstIterator lower_bound(Id id) const { auto cmp = [](const auto& metaData, Id id) { return metaData.col0Id_ < id; }; return std::lower_bound(_vec.begin(), _vec.end(), id, cmp); } - M _vec; + Iterator lower_bound(Id id) { + auto cmp = [](const auto& metaData, Id id) { + return metaData.col0Id_ < id; + }; + return std::lower_bound(_vec.begin(), _vec.end(), id, cmp); + } }; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f94b54c063..c81ecaa9c5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -104,6 +104,8 @@ addLinkAndDiscoverTest(IndexMetaDataTest index) # TODO fix this addLinkAndDiscoverTestSerial(IndexTest index) +addLinkAndDiscoverTestSerial(LocatedTriplesTest index) + addLinkAndDiscoverTest(FTSAlgorithmsTest index) addLinkAndDiscoverTest(EngineTest engine) diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp new file mode 100644 index 0000000000..ce4d0b909e --- /dev/null +++ b/test/LocatedTriplesTest.cpp @@ -0,0 +1,173 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Hannah Bast + +#include + +#include "./util/IdTableHelpers.h" +#include "./util/IdTestHelpers.h" +#include "index/CompressedRelation.h" +#include "index/IndexMetaData.h" +#include "index/LocatedTriples.h" +#include "index/Permutations.h" + +// TODO: Why the namespace here? 
(copied from `test/IndexMetaDataTest.cpp`) +namespace { +auto V = ad_utility::testing::VocabId; +} + +// Fixture with helper functions. +class LocatedTriplesTest : public ::testing::Test { + protected: + // Make `LocatedTriplesPerBlock` from a list of `LocatedTriple` objects (the + // order in which the objects are given does not matter). + LocatedTriplesPerBlock makeLocatedTriplesPerBlock( + std::vector locatedTriples) { + LocatedTriplesPerBlock result; + for (auto locatedTriple : locatedTriples) { + result.add(locatedTriple); + } + return result; + } +}; + +// Test the method that counts the number of `LocatedTriple`s in a block. +TEST_F(LocatedTriplesTest, numTriplesInBlock) { + // Set up lists of located triples for three blocks. + auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( + {LocatedTriple{1, 0, V(10), V(1), V(0), true}, + LocatedTriple{1, 0, V(10), V(2), V(1), true}, + LocatedTriple{1, 0, V(11), V(3), V(0), false}, + LocatedTriple{2, 0, V(20), V(4), V(0), false}, + LocatedTriple{2, 0, V(21), V(5), V(0), false}, + LocatedTriple{3, 0, V(30), V(6), V(0), false}, + LocatedTriple{3, 0, V(32), V(7), V(0), true}}); + ASSERT_EQ(locatedTriplesPerBlock.numBlocks(), 3); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(), 7); + + auto P = [](size_t n1, size_t n2) -> std::pair { + return {n1, n2}; + }; + + // Check the total counts per block. NOTE(review): judging from the fixture above, each pair is presumably (number of triples with the last flag `false`, number with the flag `true`) -- confirm against `numTriples`. + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1), P(1, 2)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2), P(2, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3), P(1, 1)); + + // Check the counts per block for a given `id1`. 
+ ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10)), P(0, 2)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(11)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(20)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(21)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(30)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(32)), P(0, 1)); + + // Check the counts per block for a given `id1` and `id2`. + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10), V(1)), P(0, 1)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10), V(2)), P(0, 1)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(11), V(3)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(20), V(4)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(21), V(5)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(30), V(6)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(32), V(7)), P(0, 1)); +} + +// Test the method that merges the matching `LocatedTriple`s from a block into a +// part of an `IdTable`. +TEST_F(LocatedTriplesTest, mergeTriples) { + // A block, as it could come from an index scan. + IdTable block = makeIdTableFromVector({{10, 10}, // Row 0 + {15, 20}, // Row 1 + {15, 30}, // Row 2 + {20, 10}, // Row 3 + {30, 20}, // Row 4 + {30, 30}}); // Row 5 + + // A set of located triples for that block. 
+ auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( + {LocatedTriple{1, 0, V(1), V(10), V(10), true}, // Delete row 0 + LocatedTriple{1, 1, V(1), V(10), V(11), false}, // Insert before row 1 + LocatedTriple{1, 1, V(2), V(11), V(10), false}, // Insert before row 1 + LocatedTriple{1, 4, V(2), V(21), V(11), false}, // Insert before row 4 + LocatedTriple{1, 4, V(2), V(30), V(10), false}, // Insert before row 4 + LocatedTriple{1, 4, V(2), V(30), V(20), true}, // Delete row 4 + LocatedTriple{1, 5, V(3), V(30), V(30), true}}); // Delete row 5 + + // Merge all these triples into `block` and check that the result is as + // expected (four triples inserted and three triples deleted). + { + IdTable resultExpected = makeIdTableFromVector({{10, 11}, // Row 0 + {11, 10}, // Row 1 + {15, 20}, // Row 2 + {15, 30}, // Row 3 + {20, 10}, // Row 4 + {21, 11}, // Row 5 + {30, 10}}); // Row 6 + IdTable result(2, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, block.clone(), result, 0); + ASSERT_EQ(result, resultExpected); + } + + // Merge only the triples with `id1 == V(2)` into `block` (three triples + // inserted and one triple deleted). + { + IdTable resultExpected = makeIdTableFromVector({{10, 10}, // Row 0 + {11, 10}, // Row 1 + {15, 20}, // Row 2 + {15, 30}, // Row 3 + {20, 10}, // Row 4 + {21, 11}, // Row 5 + {30, 10}, // Row 6 + {30, 30}}); // Row 7 + IdTable result(2, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, block.clone(), result, 0, V(2)); + ASSERT_EQ(result, resultExpected); + } + + // Repeat but with a partial block that leaves out the first two elements of + // `block`. 
+ { + IdTable resultExpected = makeIdTableFromVector({{15, 30}, // Row 0 + {20, 10}, // Row 1 + {21, 11}, // Row 2 + {30, 10}, // Row 3 + {30, 30}}); // Row 4 + IdTable result(2, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, block.clone(), result, 0, V(2), 2); + ASSERT_EQ(result, resultExpected); + } + + // Merge only the triples with `id1 == V(2)` and `id2 == V(30)` into the + // corresponding partial block (one triple inserted, one triple deleted). + { + IdTable blockColumnId3(1, ad_utility::testing::makeAllocator()); + blockColumnId3.resize(block.size()); + for (size_t i = 0; i < block.size(); ++i) { + blockColumnId3(i, 0) = block(i, 1); + } + IdTable resultExpected = makeIdTableFromVector({{10}, {30}}); + IdTable result(1, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, std::move(blockColumnId3), result, 0, + V(2), V(30), 4, 6); + ASSERT_EQ(result, resultExpected); + } + + // Merge special triples. NOTE(review): this case never compares `result` with `resultExpected`, so it only checks that the call itself does not fail -- confirm whether an `ASSERT_EQ` is intended. The local `locatedTriplesPerBlock` below also shadows the one declared at the top of this test. + { + size_t NRI = LocatedTriple::NO_ROW_INDEX; + auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( + {LocatedTriple{2, NRI, V(1), V(30), V(40), true}, + LocatedTriple{2, NRI, V(1), V(30), V(50), true}, + LocatedTriple{2, NRI, V(1), V(40), V(10), true}}); + IdTable resultExpected = makeIdTableFromVector({{30, 40}, // Row 0 + {30, 50}, // Row 1 + {40, 10}}); // Row 2 + IdTable result(2, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(2, std::nullopt, result, 0, V(1)); + } +}