diff --git a/.gitignore b/.gitignore index 2385f59d8f..278a771944 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,11 @@ -# Build directory +# Build directories build/ +debug/ cmake-build* +# Debugger history file +.gdb_history + # End-to-End data e2e_data/* # Compiled Object files diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 3ff82e1177..bc5a1d654b 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -1,4 +1,4 @@ -// Copyright 2011 - 2022, University of Freiburg +// Copyright 2011 - 2023, University of Freiburg // Chair of Algorithms and Data Structures // Authors: Björn Buchhold // Johannes Kalmbach @@ -13,6 +13,7 @@ #include "engine/ExportQueryExecutionTrees.h" #include "engine/QueryPlanner.h" +#include "parser/TurtleParser.h" #include "util/BoostHelpers/AsyncWaitForFuture.h" #include "util/OnDestructionDontThrowDuringStackUnwinding.h" @@ -310,11 +311,63 @@ Awaitable Server::process( logCommand(cmd, "clear cache completely (including unpinned elements)"); cache_.clearAll(); response = createJsonResponse(composeCacheStatsJson(), request); + } else if (auto cmd = checkParameter("cmd", "clear-delta-triples")) { + logCommand(cmd, "clear delta triples"); + index_.deltaTriples().clear(); + response = createJsonResponse(composeStatsJson(), request); } else if (auto cmd = checkParameter("cmd", "get-settings")) { logCommand(cmd, "get server settings"); response = createJsonResponse(RuntimeParameters().toMap(), request); } + // Insert or delete triples. + // + // TODO: This is a preliminary interface for testing. Eventually, this should + // be included in our SPARQL grammer (where the line `updateUnit : update;` at + // the beginning is currently commented out). + // + // TODO: For testing purposes, allow insertions and deletions without access + // token. Eventually, this should be restricted, of course, which can be + // easily done by adding the argument `accessTokenOk` to each of the calls for + // `checkParameter`. + { + bool insertDetected = false; + bool deleteDetected = false; + std::optional parameterValue; + if ((parameterValue = checkParameter("insert", std::nullopt))) { + LOG(INFO) << "INSERT: " << parameterValue.value() << std::endl; + insertDetected = true; + } else if ((parameterValue = checkParameter("delete", std::nullopt))) { + LOG(INFO) << "DELETE: " << parameterValue.value() << std::endl; + deleteDetected = true; + } + if (insertDetected || deleteDetected) { + AD_CORRECTNESS_CHECK(parameterValue.has_value()); + const std::string& input = parameterValue.value(); + TurtleStringParser parser; + parser.parseUtf8String(input); + if (parser.getTriples().size() == 0) { + throw std::runtime_error("Triple could not be parsed"); + } else if (parser.getTriples().size() > 1) { + throw std::runtime_error("Only one triple per call please"); + } + TurtleTriple turtleTriple = parser.getTriples()[0]; + if (insertDetected) { + index_.deltaTriples().insertTriple(std::move(turtleTriple)); + response = + createOkResponse(absl::StrCat("INSERT operation for triple \"", + input, "\" processed\n"), + request, ad_utility::MediaType::textPlain); + } else { + index_.deltaTriples().deleteTriple(std::move(turtleTriple)); + response = + createOkResponse(absl::StrCat("DELETE operation for triple \"", + input, "\" processed\n"), + request, ad_utility::MediaType::textPlain); + } + } + } + // Ping with or without messsage. if (urlPathAndParameters._path == "/ping") { if (auto msg = checkParameter("msg", std::nullopt)) { @@ -455,6 +508,8 @@ json Server::composeStatsJson() const { result["num-text-records"] = index_.getNofTextRecords(); result["num-word-occurrences"] = index_.getNofWordPostings(); result["num-entity-occurrences"] = index_.getNofEntityPostings(); + result["num-delta-triples-inserted"] = index_.deltaTriples().numInserted(); + result["num-delta-triples-deleted"] = index_.deltaTriples().numDeleted(); return result; } diff --git a/src/engine/Server.h b/src/engine/Server.h index 5455353074..3395094a1d 100644 --- a/src/engine/Server.h +++ b/src/engine/Server.h @@ -13,6 +13,7 @@ #include "engine/QueryExecutionContext.h" #include "engine/QueryExecutionTree.h" #include "engine/SortPerformanceEstimator.h" +#include "index/DeltaTriples.h" #include "index/Index.h" #include "nlohmann/json.hpp" #include "parser/ParseException.h" diff --git a/src/global/Id.h b/src/global/Id.h index 2eaacadc4f..62a96d0fff 100644 --- a/src/global/Id.h +++ b/src/global/Id.h @@ -8,9 +8,9 @@ #include #include -#include "../util/Exception.h" -#include "./IndexTypes.h" -#include "./ValueId.h" +#include "global/IndexTypes.h" +#include "global/ValueId.h" +#include "util/Exception.h" using Id = ValueId; typedef uint16_t Score; diff --git a/src/global/IdTriple.h b/src/global/IdTriple.h new file mode 100644 index 0000000000..0353b8c747 --- /dev/null +++ b/src/global/IdTriple.h @@ -0,0 +1,18 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#pragma once + +#include + +#include "global/Id.h" + +// Should we have an own class for this? We need this at several places. +using IdTriple = std::array; + +// Hash value for such triple. +template +H AbslHashValue(H h, const IdTriple& triple) { + return H::combine(std::move(h), triple[0], triple[1], triple[2]); +} diff --git a/src/global/ValueId.h b/src/global/ValueId.h index b51d0aef16..26ce023ab8 100644 --- a/src/global/ValueId.h +++ b/src/global/ValueId.h @@ -282,10 +282,10 @@ class ValueId { /// This operator is only for debugging and testing. It returns a /// human-readable representation. friend std::ostream& operator<<(std::ostream& ostr, const ValueId& id) { - ostr << toString(id.getDatatype()) << ':'; + ostr << toString(id.getDatatype())[0] << ':'; auto visitor = [&ostr](T&& value) { if constexpr (ad_utility::isSimilar) { - ostr << "Undefined"; + ostr << "xx"; } else if constexpr (ad_utility::isSimilar || ad_utility::isSimilar) { ostr << std::to_string(value); diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index 4bbf53f647..4804b11ff3 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -8,6 +8,8 @@ add_library(index VocabularyOnDisk.h VocabularyOnDisk.cpp IndexMetaData.h IndexMetaDataImpl.h MetaDataHandler.h + LocatedTriples.h LocatedTriples.cpp + DeltaTriples.h DeltaTriples.cpp StxxlSortFunctors.h TextMetaData.cpp TextMetaData.h DocsDB.cpp DocsDB.h diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp index bdaa15bd72..49ecfdaa38 100644 --- a/src/index/CompressedRelation.cpp +++ b/src/index/CompressedRelation.cpp @@ -5,6 +5,7 @@ #include "CompressedRelation.h" #include "engine/idTable/IdTable.h" +#include "util/AllocatorWithLimit.h" #include "util/Cache.h" #include "util/CompressionUsingZstd/ZstdWrapper.h" #include "util/ConcurrentCache.h" @@ -15,10 +16,11 @@ using namespace std::chrono_literals; // ____________________________________________________________________________ void CompressedRelationReader::scan( - const CompressedRelationMetadata& metadata, - const vector& blockMetadata, + const CompressedRelationMetadata& metadataForRelation, + const vector& metadataForAllBlocks, ad_utility::File& file, IdTable* result, - ad_utility::SharedConcurrentTimeoutTimer timer) const { + ad_utility::SharedConcurrentTimeoutTimer timer, + const LocatedTriplesPerBlock& locatedTriplesPerBlock) const { AD_CONTRACT_CHECK(result->numColumns() == NumColumns); // get all the blocks where col0FirstId_ <= col0Id <= col0LastId_ @@ -26,33 +28,22 @@ void CompressedRelationReader::scan( Id col0FirstId_; Id col0LastId_; }; - Id col0Id = metadata.col0Id_; + Id col0Id = metadataForRelation.col0Id_; // TODO Use a structured binding. Structured bindings are // currently not supported by clang when using OpenMP because clang internally // transforms the `#pragma`s into lambdas, and capturing structured bindings // is only supported in clang >= 16. - decltype(blockMetadata.begin()) beginBlock, endBlock; + decltype(metadataForAllBlocks.begin()) beginBlock, endBlock; std::tie(beginBlock, endBlock) = std::equal_range( // TODO For some reason we can't use `std::ranges::equal_range`, // find out why. Note: possibly it has something to do with the limited // support of ranges in clang with versions < 16. Revisit this when // we use clang 16. - blockMetadata.begin(), blockMetadata.end(), KeyLhs{col0Id, col0Id}, - [](const auto& a, const auto& b) { + metadataForAllBlocks.begin(), metadataForAllBlocks.end(), + KeyLhs{col0Id, col0Id}, [](const auto& a, const auto& b) { return a.col0FirstId_ < b.col0FirstId_ && a.col0LastId_ < b.col0LastId_; }); - // The total size of the result is now known. - result->resize(metadata.getNofElements()); - - // The position in the result to which the next block is being - // decompressed. - size_t rowIndexOfNextBlock = 0; - - // The number of rows for which we still have space - // in the result (only needed for checking of invariants). - size_t spaceLeft = result->size(); - // The first block might contain entries that are not part of our // actual scan result. bool firstBlockIsIncomplete = @@ -69,94 +60,186 @@ void CompressedRelationReader::scan( AD_CORRECTNESS_CHECK(!firstBlockIsIncomplete || (beginBlock == lastBlock)); AD_CORRECTNESS_CHECK(!lastBlockIsIncomplete); if (firstBlockIsIncomplete) { - AD_CORRECTNESS_CHECK(metadata.offsetInBlock_ != + AD_CORRECTNESS_CHECK(metadataForRelation.offsetInBlock_ != std::numeric_limits::max()); } + // Compute the numer of inserted and deleted triples per block and overall. + // note the `<=` so that we don't forget the block beyond the last (which may + // have information about delta triples at the vey end of a relation). + std::vector> numInsAndDelPerBlock; + size_t numInsTotal = 0; + size_t numDelTotal = 0; + for (auto block = beginBlock; block <= endBlock; ++block) { + size_t blockIndex = block - metadataForAllBlocks.begin(); + auto [numIns, numDel] = + block == beginBlock || block == endBlock + ? locatedTriplesPerBlock.numTriples(blockIndex, col0Id) + : locatedTriplesPerBlock.numTriples(blockIndex); + numInsTotal += numIns; + numDelTotal += numDel; + numInsAndDelPerBlock.push_back({numIns, numDel}); + } + if (numInsTotal > 0 || numDelTotal > 0) { + LOG(INFO) << "Index scan with delta triples: #inserts = " << numInsTotal + << ", #deletes = " << numDelTotal + << ", #blocks = " << (endBlock - beginBlock) << std::endl; + AD_CORRECTNESS_CHECK(numDelTotal < metadataForRelation.getNofElements()); + } + + // The total size of the result is now known. + result->resize(metadataForRelation.getNofElements() + numInsTotal - + numDelTotal); + + // The position in the result to which the next block is being + // decompressed. + size_t offsetInResult = 0; + + // The number of rows for which we still have space + // in the result (only needed for checking of invariants). + size_t spaceLeft = result->size(); + // We have at most one block that is incomplete and thus requires trimming. // Set up a lambda, that reads this block and decompresses it to // the result. - auto readIncompleteBlock = [&](const auto& block) { + auto processIncompleteBlock = [&](const auto& blockMetadata) { // A block is uniquely identified by its start position in the file. - auto cacheKey = block.offsetsAndCompressedSize_.at(0).offsetInFile_; - auto uncompressedBuffer = blockCache_ - .computeOnce(cacheKey, - [&]() { - return readAndDecompressBlock( - block, file, std::nullopt); - }) - ._resultPointer; - - // Extract the part of the block that actually belongs to the relation - auto numElements = metadata.numRows_; - AD_CORRECTNESS_CHECK(uncompressedBuffer->numColumns() == - metadata.numColumns()); - for (size_t i = 0; i < uncompressedBuffer->numColumns(); ++i) { - const auto& inputCol = uncompressedBuffer->getColumn(i); - auto begin = inputCol.begin() + metadata.offsetInBlock_; - auto resultColumn = result->getColumn(i); - AD_CORRECTNESS_CHECK(numElements <= spaceLeft); - std::copy(begin, begin + numElements, resultColumn.begin()); + // + // NOTE: We read these blocks via a cache in order to speed up the unit + // tests (which make many requests to the same block, so we don't want + // to decompress it again and again). + auto cacheKey = + blockMetadata->offsetsAndCompressedSize_.at(0).offsetInFile_; + auto block = blockCache_ + .computeOnce(cacheKey, + [&]() { + return readAndDecompressBlock( + *blockMetadata, file, std::nullopt); + }) + ._resultPointer; + + // Determine (via the metadata for the relation), exactly which part of the + // block belongs to the relation. + auto numInsAndDel = numInsAndDelPerBlock.at(blockMetadata - beginBlock); + size_t rowIndexBegin = metadataForRelation.offsetInBlock_; + size_t rowIndexEnd = rowIndexBegin + metadataForRelation.numRows_; + AD_CORRECTNESS_CHECK(rowIndexBegin < block->size()); + AD_CORRECTNESS_CHECK(rowIndexEnd <= block->size()); + AD_CORRECTNESS_CHECK(block->numColumns() == + metadataForRelation.numColumns()); + size_t numRowsWrittenToResult = rowIndexEnd - rowIndexBegin; + + // Without delta triples, just copy the part of the block to `result`. + // Otherwise use `mergeTriples`. + if (numInsAndDel == std::pair{0, 0}) { + for (size_t i = 0; i < numRowsWrittenToResult; ++i) { + (*result)(offsetInResult + i, 0) = (*block)(rowIndexBegin + i, 0); + (*result)(offsetInResult + i, 1) = (*block)(rowIndexBegin + i, 1); + } + } else { + // TODO: First copy `*block` to an object of class `IdTable`. This copy + // would be avoidable, see the related comment in `getBlockPart` in the + // other `CompressedRelationReader::scan` below. + ad_utility::AllocatorWithLimit allocator{ + ad_utility::makeAllocationMemoryLeftThreadsafeObject( + std::numeric_limits::max())}; + IdTable blockAsIdTable(2, allocator); + blockAsIdTable.resize(block->size()); + for (size_t i = 0; i < block->size(); ++i) { + blockAsIdTable(i, 0) = (*block)(i, 0); + blockAsIdTable(i, 1) = (*block)(i, 1); + } + // Now call `mergeTriples` on `blockAsIdTable`. + size_t blockIndex = blockMetadata - metadataForAllBlocks.begin(); + size_t numRowsWrittenExpected = numRowsWrittenToResult; + numRowsWrittenExpected += numInsAndDel.first - numInsAndDel.second; + numRowsWrittenToResult = locatedTriplesPerBlock.mergeTriples( + blockIndex, std::move(blockAsIdTable), *result, offsetInResult, + col0Id, rowIndexBegin); + AD_CORRECTNESS_CHECK(numRowsWrittenToResult == numRowsWrittenExpected); } - rowIndexOfNextBlock += numElements; - spaceLeft -= numElements; + + AD_CORRECTNESS_CHECK(numRowsWrittenToResult <= spaceLeft); + offsetInResult += numRowsWrittenToResult; + spaceLeft -= numRowsWrittenToResult; }; // Read the first block if it is incomplete + auto completeBlocksBegin = beginBlock; + auto completeBlocksEnd = endBlock; if (firstBlockIsIncomplete) { - readIncompleteBlock(*beginBlock); - ++beginBlock; + processIncompleteBlock(beginBlock); + ++completeBlocksBegin; if (timer) { timer->wlock()->checkTimeoutAndThrow("IndexScan :"); } } - // Read all the other (complete!) blocks in parallel - if (beginBlock < endBlock) { + // Process all the other (complete) blocks. The compressed blocks are + // read sequentially from disk and then decompressed in parallel. + if (completeBlocksBegin < completeBlocksEnd) { #pragma omp parallel #pragma omp single - { - for (; beginBlock < endBlock; ++beginBlock) { - const auto& block = *beginBlock; - // Read a block from disk (serially). - - CompressedBlock compressedBuffer = - readCompressedBlockFromFile(block, file, std::nullopt); - - // This lambda decompresses the block that was just read to the - // correct position in the result. - auto decompressLambda = [&result, rowIndexOfNextBlock, &block, - compressedBuffer = - std::move(compressedBuffer)]() { - ad_utility::TimeBlockAndLog tbl{"Decompressing a block"}; - - decompressBlockToExistingIdTable(compressedBuffer, block.numRows_, - *result, rowIndexOfNextBlock); - }; + for (auto block = completeBlocksBegin; block < completeBlocksEnd; ++block) { + size_t blockIndex = block - metadataForAllBlocks.begin(); + auto numInsAndDel = numInsAndDelPerBlock.at(block - beginBlock); - // The `decompressLambda` can now run in parallel -#pragma omp task - { - if (!timer || !timer->wlock()->hasTimedOut()) { - decompressLambda(); - }; - } + // Read the compressed block from disk (both columns). + CompressedBlock compressedBuffer = + readCompressedBlockFromFile(*block, file, std::nullopt); + + // This lambda decompresses the block that was just read to the + // correct position in the result. + auto decompressLambda = [&result, &locatedTriplesPerBlock, &block, + numInsAndDel, offsetInResult, blockIndex, + compressedBuffer = + std::move(compressedBuffer)]() { + ad_utility::TimeBlockAndLog tbl{"Decompressing a block"}; - // this is again serial code, set up the correct pointers - // for the next block; - spaceLeft -= block.numRows_; - rowIndexOfNextBlock += block.numRows_; + decompressBlockToExistingIdTable(compressedBuffer, block->numRows_, + *result, offsetInResult, numInsAndDel, + locatedTriplesPerBlock, blockIndex); + }; + + // This `decompressLambda` can run concurrently. +#pragma omp task + { + if (!timer || !timer->wlock()->hasTimedOut()) { + decompressLambda(); + }; } - AD_CORRECTNESS_CHECK(spaceLeft == 0); - } // End of omp parallel region, all the decompression was handled now. + + // Update the counters. + AD_CORRECTNESS_CHECK(numInsAndDel.second <= block->numRows_); + size_t numRowsOfThisBlock = + block->numRows_ + numInsAndDel.first - numInsAndDel.second; + AD_CORRECTNESS_CHECK(numRowsOfThisBlock <= spaceLeft); + spaceLeft -= numRowsOfThisBlock; + offsetInResult += numRowsOfThisBlock; + } + // End of omp parallel region, all blocks are decompressed now. } + + // Add delta triples from beyond last block, if any. + AD_CORRECTNESS_CHECK(numInsAndDelPerBlock.size() > 0); + auto numInsBeyondLastBlock = numInsAndDelPerBlock.back().first; + if (numInsBeyondLastBlock > 0) { + size_t blockIndex = endBlock - metadataForAllBlocks.begin(); + size_t numRowsWrittenToResult = locatedTriplesPerBlock.mergeTriples( + blockIndex, std::nullopt, *result, offsetInResult, col0Id); + AD_CORRECTNESS_CHECK(numRowsWrittenToResult == numInsBeyondLastBlock); + spaceLeft -= numRowsWrittenToResult; + } + AD_CORRECTNESS_CHECK(spaceLeft == 0); } // _____________________________________________________________________________ void CompressedRelationReader::scan( - const CompressedRelationMetadata& metaData, Id col1Id, - const vector& blocks, ad_utility::File& file, - IdTable* result, ad_utility::SharedConcurrentTimeoutTimer timer) const { + const CompressedRelationMetadata& metadataForRelation, Id col1Id, + const vector& metadataForAllBlocks, + ad_utility::File& file, IdTable* result, + ad_utility::SharedConcurrentTimeoutTimer timer, + const LocatedTriplesPerBlock& locatedTriplesPerBlock) const { AD_CONTRACT_CHECK(result->numColumns() == 1); // Get all the blocks that possibly might contain our pair of col0Id and @@ -175,114 +258,223 @@ void CompressedRelationReader::scan( return endBeforeBegin; }; - Id col0Id = metaData.col0Id_; + Id col0Id = metadataForRelation.col0Id_; // Note: See the comment in the other overload for `scan` above for the // reason why we (currently) can't use a structured binding here. - decltype(blocks.begin()) beginBlock, endBlock; + decltype(metadataForAllBlocks.begin()) beginBlock, endBlock; std::tie(beginBlock, endBlock) = - std::equal_range(blocks.begin(), blocks.end(), + std::equal_range(metadataForAllBlocks.begin(), metadataForAllBlocks.end(), KeyLhs{col0Id, col0Id, col1Id, col1Id}, comp); + // Compute the number of inserted and deleted triples per block and overall. + // note the `<=` so that we don't forget the block beyond the last (which + // may have information about delta triples at the vey end of a relation). + std::vector> numInsAndDelPerBlock; + size_t numInsTotal = 0; + size_t numDelTotal = 0; + for (auto block = beginBlock; block <= endBlock; ++block) { + size_t blockIndex = block - metadataForAllBlocks.begin(); + auto [numIns, numDel] = + block == beginBlock || block == endBlock - 1 || block == endBlock + ? locatedTriplesPerBlock.numTriples(blockIndex, col0Id, col1Id) + : locatedTriplesPerBlock.numTriples(blockIndex); + numInsTotal += numIns; + numDelTotal += numDel; + numInsAndDelPerBlock.push_back({numIns, numDel}); + } + if (numInsTotal > 0 || numDelTotal > 0) { + LOG(INFO) << "Index scan with delta triples: #inserts = " << numInsTotal + << ", #deletes = " << numDelTotal + << ", #blocks = " << (endBlock - beginBlock) << std::endl; + AD_CORRECTNESS_CHECK(numDelTotal < metadataForRelation.getNofElements()); + } + // Invariant: The col0Id is completely stored in a single block, or it is // contained in multiple blocks that only contain this col0Id, - bool col0IdHasExclusiveBlocks = - metaData.offsetInBlock_ == std::numeric_limits::max(); + bool col0IdHasExclusiveBlocks = metadataForRelation.offsetInBlock_ == + std::numeric_limits::max(); if (!col0IdHasExclusiveBlocks) { // This might also be zero if no block was found at all. AD_CORRECTNESS_CHECK(endBlock - beginBlock <= 1); } - // The first and the last block might be incomplete (that is, only - // a part of these blocks is actually part of the result, - // set up a lambda which allows us to read these blocks, and returns - // the result as a vector. - auto readPossiblyIncompleteBlock = [&](const auto& block) { - DecompressedBlock uncompressedBuffer = - readAndDecompressBlock(block, file, std::nullopt); - AD_CORRECTNESS_CHECK(uncompressedBuffer.numColumns() == 2); - const auto& col1Column = uncompressedBuffer.getColumn(0); - const auto& col2Column = uncompressedBuffer.getColumn(1); - AD_CORRECTNESS_CHECK(col1Column.size() == col2Column.size()); - - // Find the range in the block, that belongs to the same relation `col0Id` - bool containedInOnlyOneBlock = - metaData.offsetInBlock_ != std::numeric_limits::max(); - auto begin = col1Column.begin(); - if (containedInOnlyOneBlock) { - begin += metaData.offsetInBlock_; - } - auto end = - containedInOnlyOneBlock ? begin + metaData.numRows_ : col1Column.end(); - - // Find the range in the block, where also the col1Id matches (the second - // ID in the `std::array` does not matter). - std::tie(begin, end) = std::equal_range(begin, end, col1Id); + // Helper class for a part of a block (needed for the first and last block + // in the following). These are small objects, so an unlimited allocator is + // OK. + struct BlockPart { + std::unique_ptr idTable = nullptr; + size_t rowIndexBegin = 0; + size_t rowIndexEnd = 0; + size_t blockIndex = 0; + std::pair numInsAndDel; + size_t size() const { return rowIndexEnd - rowIndexBegin; } + }; - size_t beginIndex = begin - col1Column.begin(); - size_t endIndex = end - col1Column.begin(); + // Helper lambda that extracts the relevant `Id`s from the given + // `blockMetadata` iterator. Returns the corresponding part and its (row + // index) begin and end index in the original block. + // + // NOTE: This is used for the first and last block below because these may + // contain triples that do not match `col0Id` and `col1Id`. We cannot + // directly merge these into `result` because we first need to know its + // total size and resize it before we can write to it. + auto getBlockPart = [&](auto blockMetadata) -> BlockPart { + DecompressedBlock block = + readAndDecompressBlock(*blockMetadata, file, std::nullopt); + + // First find the range with matching `col0Id`. The `if` condition asks if + // the relation is contained in a single block (this one). + auto blockPartBegin = block.begin(); + auto blockPartEnd = block.end(); + if (metadataForRelation.offsetInBlock_ != + std::numeric_limits::max()) { + blockPartBegin += metadataForRelation.offsetInBlock_; + blockPartEnd = blockPartBegin + metadataForRelation.numRows_; + AD_CORRECTNESS_CHECK(blockPartBegin < block.end()); + AD_CORRECTNESS_CHECK(blockPartEnd <= block.end()); + } - // Only extract the relevant portion of the second column. - std::vector result(col2Column.begin() + beginIndex, - col2Column.begin() + endIndex); - return result; + // Within that range find the subrange, where also `col1Id` matches. + std::tie(blockPartBegin, blockPartEnd) = std::equal_range( + blockPartBegin, blockPartEnd, std::array{col1Id}, + [](const auto& x, const auto& y) { return x[0] < y[0]; }); + // std::cout << "Block part: "; + // std::transform(blockPartBegin, blockPartEnd, + // std::ostream_iterator(std::cout, " "), + // [](const auto& row) { + // return absl::StrCat("{", row[0], " ", row[1], "}"); + // }); + // std::cout << std::endl; + + // Variables for the index of this block and the range. + // + // TODO: `IndexTest.scanTest` failes if we check `rowIndexEnd > + // rowIndexBegin` instead of just `>='. Can this really happen? + size_t rowIndexBegin = blockPartBegin - block.begin(); + size_t rowIndexEnd = blockPartEnd - block.begin(); + AD_CORRECTNESS_CHECK(rowIndexBegin < block.size()); + AD_CORRECTNESS_CHECK(rowIndexEnd <= block.size()); + AD_CORRECTNESS_CHECK(rowIndexEnd >= rowIndexBegin); + size_t blockIndex = blockMetadata - metadataForAllBlocks.begin(); + auto numInsAndDel = numInsAndDelPerBlock.at(blockMetadata - beginBlock); + + // Copy `block` to an `IdTable`. + // + // TODO: This is an unecessary copy. Extend the `IdTable` class so that we + // can move the data from the second column of `block` to + // `blockAsIdTable`. + ad_utility::AllocatorWithLimit allocator{ + ad_utility::makeAllocationMemoryLeftThreadsafeObject( + std::numeric_limits::max())}; + IdTable result(1, allocator); + result.resize(block.size()); + for (size_t i = 0; i < block.size(); ++i) { + result(i, 0) = block(i, 1); + } + return {std::make_unique(std::move(result)), rowIndexBegin, + rowIndexEnd, blockIndex, numInsAndDel}; }; - // The first and the last block might be incomplete, compute - // and store the partial results from them. - std::vector firstBlockResult, lastBlockResult; + // The first and the last block might be incomplete. We process them + // separately from the complete blocks inbetween. + BlockPart firstBlockPart; + BlockPart lastBlockPart; + auto completeBlocksBegin = beginBlock; + auto completeBlocksEnd = endBlock; if (beginBlock < endBlock) { - firstBlockResult = readPossiblyIncompleteBlock(*beginBlock); - ++beginBlock; + firstBlockPart = getBlockPart(beginBlock); + ++completeBlocksBegin; if (timer) { timer->wlock()->checkTimeoutAndThrow("IndexScan: "); } } - if (beginBlock < endBlock) { - lastBlockResult = readPossiblyIncompleteBlock(*(endBlock - 1)); - endBlock--; + if (completeBlocksBegin < endBlock) { + --completeBlocksEnd; + lastBlockPart = getBlockPart(completeBlocksEnd); if (timer) { timer->wlock()->checkTimeoutAndThrow("IndexScan: "); } } - // Determine the total size of the result. - // First accumulate the complete blocks in the "middle" - auto totalResultSize = std::accumulate( - beginBlock, endBlock, 0ul, [](const auto& count, const auto& block) { - return count + block.numRows_; - }); - // Add the possibly incomplete blocks from the beginning and end; - totalResultSize += firstBlockResult.size() + lastBlockResult.size(); - + // The total result size is the size of complete blocks plus the size of the + // possibly incomplete blocks at the beginning and end, plus the number of + // inserted triples minus the number of deleted triples. + auto totalResultSize = + std::accumulate(completeBlocksBegin, completeBlocksEnd, 0ul, + [](const auto& count, const auto& block) { + return count + block.numRows_; + }); + totalResultSize += firstBlockPart.size() + lastBlockPart.size(); + AD_CORRECTNESS_CHECK(numDelTotal <= totalResultSize); + totalResultSize += numInsTotal - numDelTotal; result->resize(totalResultSize); + size_t spaceLeft = result->size(); + size_t offsetInResult = 0; + + // Helper lambda for processing the first or last block. + // + // NOTE: This should only be called once for a given `BlockPart` because the + // `idTable` is moved away from it. + auto processBlockPart = [&](BlockPart& blockPart) { + if (blockPart.idTable) { + size_t numRowsWrittenToResult = 0; + // If there are no delta triples, copy directly to the result, otherwise + // use (the slightly more expensive) `mergeTriples`. + if (blockPart.numInsAndDel == std::pair{0, 0}) { + for (size_t i = 0; i < blockPart.size(); ++i) { + (*result)(offsetInResult + i, 0) = + (*blockPart.idTable)(blockPart.rowIndexBegin + i, 0); + } + numRowsWrittenToResult = blockPart.size(); + } else { + numRowsWrittenToResult = locatedTriplesPerBlock.mergeTriples( + blockPart.blockIndex, std::move(*(blockPart.idTable)), *result, + offsetInResult, col0Id, col1Id, blockPart.rowIndexBegin, + blockPart.rowIndexEnd); + } + // Check that `numRowsWrittenToResult` is as expected. + { + size_t expected = blockPart.size(); + AD_CORRECTNESS_CHECK(blockPart.numInsAndDel.second <= expected); + expected += blockPart.numInsAndDel.first; + expected -= blockPart.numInsAndDel.second; + AD_CORRECTNESS_CHECK(numRowsWrittenToResult == expected); + } + AD_CORRECTNESS_CHECK(numRowsWrittenToResult <= spaceLeft); + offsetInResult += numRowsWrittenToResult; + spaceLeft -= numRowsWrittenToResult; + } + }; - // Insert the first block into the result; - std::copy(firstBlockResult.begin(), firstBlockResult.end(), - result->getColumn(0).data()); - size_t rowIndexOfNextBlockStart = firstBlockResult.size(); - - // Insert the complete blocks from the middle in parallel - if (beginBlock < endBlock) { + // Process the first block part, then all the complete blocks, then the last + // block part. The complete blocks are read sequentially from disk and then + // (after we know their position in `result`) decompressed and merged into + // `result` in parallel. + processBlockPart(firstBlockPart); + if (completeBlocksBegin < completeBlocksEnd) { #pragma omp parallel #pragma omp single - for (; beginBlock < endBlock; ++beginBlock) { - const auto& block = *beginBlock; + for (auto block = completeBlocksBegin; block < completeBlocksEnd; ++block) { + size_t blockIndex = block - metadataForAllBlocks.begin(); + auto numInsAndDel = numInsAndDelPerBlock.at(block - beginBlock); - // Read the block serially, only read the second column. - AD_CORRECTNESS_CHECK(block.offsetsAndCompressedSize_.size() == 2); + // Read the compressed block from disk (second column only). + AD_CORRECTNESS_CHECK(block->offsetsAndCompressedSize_.size() == 2); CompressedBlock compressedBuffer = - readCompressedBlockFromFile(block, file, std::vector{1ul}); + readCompressedBlockFromFile(*block, file, std::vector{1ul}); // A lambda that owns the compressed block decompresses it to the // correct position in the result. It may safely be run in parallel - auto decompressLambda = [rowIndexOfNextBlockStart, &block, result, + auto decompressLambda = [&result, &block, &locatedTriplesPerBlock, + offsetInResult, numInsAndDel, blockIndex, compressedBuffer = std::move(compressedBuffer)]() mutable { ad_utility::TimeBlockAndLog tbl{"Decompression a block"}; - decompressBlockToExistingIdTable(compressedBuffer, block.numRows_, - *result, rowIndexOfNextBlockStart); + decompressBlockToExistingIdTable(compressedBuffer, block->numRows_, + *result, offsetInResult, numInsAndDel, + locatedTriplesPerBlock, blockIndex); }; // Register an OpenMP task that performs the decompression of this @@ -294,15 +486,29 @@ void CompressedRelationReader::scan( } } - // update the pointers - rowIndexOfNextBlockStart += block.numRows_; - } // end of parallel region + // Update the counters. + AD_CORRECTNESS_CHECK(numInsAndDel.second <= block->numRows_); + size_t numRowsOfThisBlock = + block->numRows_ + numInsAndDel.first - numInsAndDel.second; + AD_CORRECTNESS_CHECK(numRowsOfThisBlock <= spaceLeft); + spaceLeft -= numRowsOfThisBlock; + offsetInResult += numRowsOfThisBlock; + } + // End of omp parallel region, all blocks are decompressed now. } - // Add the last block. - std::copy(lastBlockResult.begin(), lastBlockResult.end(), - result->getColumn(0).data() + rowIndexOfNextBlockStart); - AD_CORRECTNESS_CHECK(rowIndexOfNextBlockStart + lastBlockResult.size() == - result->size()); + processBlockPart(lastBlockPart); + + // Add delta triples from beyond last block, if any. + AD_CORRECTNESS_CHECK(numInsAndDelPerBlock.size() > 0); + auto numInsBeyondLastBlock = numInsAndDelPerBlock.back().first; + if (numInsBeyondLastBlock > 0) { + size_t blockIndex = endBlock - metadataForAllBlocks.begin(); + size_t numRowsWrittenToResult = locatedTriplesPerBlock.mergeTriples( + blockIndex, std::nullopt, *result, offsetInResult, col0Id, col1Id); + AD_CORRECTNESS_CHECK(numRowsWrittenToResult == numInsBeyondLastBlock); + spaceLeft -= numRowsWrittenToResult; + } + AD_CORRECTNESS_CHECK(spaceLeft == 0); } // _____________________________________________________________________________ @@ -312,8 +518,8 @@ float CompressedRelationWriter::computeMultiplicity( float multiplicity = functional ? 1.0f : static_cast(numElements) / float(numDistinctElements); - // Ensure that the multiplicity is only exactly 1.0 if the relation is indeed - // functional to prevent numerical instabilities; + // Ensure that the multiplicity is only exactly 1.0 if the relation is + // indeed functional to prevent numerical instabilities; if (!functional && multiplicity == 1.0f) [[unlikely]] { multiplicity = std::nextafter(1.0f, 2.0f); } @@ -472,14 +678,57 @@ DecompressedBlock CompressedRelationReader::decompressBlock( // ____________________________________________________________________________ void CompressedRelationReader::decompressBlockToExistingIdTable( const CompressedBlock& compressedBlock, size_t numRowsToRead, - IdTable& table, size_t offsetInTable) { - AD_CORRECTNESS_CHECK(table.numRows() >= offsetInTable + numRowsToRead); + IdTable& result, size_t offsetInResult, + std::pair numInsAndDel, + const LocatedTriplesPerBlock& locatedTriplesPerBlock, size_t blockIndex) { + // Check that the given arguments are consistent (they should always be, + // given that this method is `private`). LOG(INFO) << "numRowsToRead: " << + // numRowsToRead << std::endl; LOG(INFO) << "numInsAndDel.first: " << + // numInsAndDel.first << std::endl; LOG(INFO) << "numInsAndDel.second: " << + // numInsAndDel.second << std::endl; + AD_CORRECTNESS_CHECK(numInsAndDel.second <= numRowsToRead); + AD_CORRECTNESS_CHECK(result.numRows() + numInsAndDel.second >= + offsetInResult + numRowsToRead + numInsAndDel.first); + AD_CORRECTNESS_CHECK(compressedBlock.size() == result.numColumns()); + + // Helper lambda that decompresses `numRowsToRead` from `compressedBlock` + // to the given `IdTable` iterator. + // + // TODO: It would be more natural to pass an `IdTable::iterator` here, but + // it seems that we can't get from that an iterator into an `IdTable` + // column. + // // TODO use zip_view. - AD_CORRECTNESS_CHECK(compressedBlock.size() == table.numColumns()); - for (size_t i = 0; i < compressedBlock.size(); ++i) { - auto col = table.getColumn(i); - decompressColumn(compressedBlock[i], numRowsToRead, - col.data() + offsetInTable); + auto decompressToIdTable = [&compressedBlock, &numRowsToRead]( + IdTable& idTable, size_t offsetInIdTable) { + size_t numColumns = compressedBlock.size(); + for (size_t i = 0; i < numColumns; ++i) { + const auto& columnFromBlock = compressedBlock[i]; + auto columnFromIdTable = idTable.getColumn(i); + decompressColumn(columnFromBlock, numRowsToRead, + columnFromIdTable.data() + offsetInIdTable); + } + }; + + // If there are no delta triples for this block, just decompress directly to + // the `result` table. Otherwise decompress to an intermediate table and + // merge from there to `result`. + // + // TODO: In the second case, we use an unlimited allocator for the space + // allocation for the intermediate table. This looks OK because our blocks + // are small, but it might be better to allocate also this table from the + // memory pool available to the server (to which we don't have acces here). + if (numInsAndDel == std::pair{0, 0}) { + decompressToIdTable(result, offsetInResult); + } else { + ad_utility::AllocatorWithLimit allocator{ + ad_utility::makeAllocationMemoryLeftThreadsafeObject( + std::numeric_limits::max())}; + IdTable blockAsIdTable(compressedBlock.size(), allocator); + blockAsIdTable.resize(numRowsToRead); + decompressToIdTable(blockAsIdTable, 0); + locatedTriplesPerBlock.mergeTriples(blockIndex, std::move(blockAsIdTable), + result, offsetInResult); } } diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 3c6c5df80a..c2275ccc56 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -2,8 +2,7 @@ // Chair of Algorithms and Data Structures // Author: Johannes Kalmbach -#ifndef QLEVER_COMPRESSEDRELATION_H -#define QLEVER_COMPRESSEDRELATION_H +#pragma once #include #include @@ -11,6 +10,7 @@ #include "engine/idTable/IdTable.h" #include "global/Id.h" #include "index/ConstantsIndexBuilding.h" +#include "index/LocatedTriples.h" #include "util/BufferedVector.h" #include "util/Cache.h" #include "util/ConcurrentCache.h" @@ -71,8 +71,14 @@ struct CompressedBlockMetadata { Id col1FirstId_; Id col1LastId_; - // For our `DeltaTriples` (https://github.com/ad-freiburg/qlever/pull/916), we - // need to know the least significant `Id` of the last triple as well. + // For `DeltaTriples::findTripleInPermutation`, it helps to know the least + // significant ID of the last triple as well. + // + // NOTE: We don't need that information for the first triple of the block and + // as a matter of fact, we don't really need `_col0FirstId` and `_col1FirstId` + // above either. It doesn't really harm though because the total size of the + // blocks is small (even for Wikidata, we have only 50K block, and as you can + // see from the members, a block consumes < 100 bytes). Id col2LastId_; // Two of these are equal if all members are equal. @@ -252,7 +258,9 @@ class CompressedRelationReader { void scan(const CompressedRelationMetadata& metadata, const vector& blockMetadata, ad_utility::File& file, IdTable* result, - ad_utility::SharedConcurrentTimeoutTimer timer) const; + ad_utility::SharedConcurrentTimeoutTimer timer, + const LocatedTriplesPerBlock& locatedTriplesPerBlock = + LocatedTriplesPerBlock{}) const; /** * @brief For a permutation XYZ, retrieve all Z for given X and Y. @@ -272,7 +280,9 @@ class CompressedRelationReader { void scan(const CompressedRelationMetadata& metaData, Id col1Id, const vector& blocks, ad_utility::File& file, IdTable* result, - ad_utility::SharedConcurrentTimeoutTimer timer = nullptr) const; + ad_utility::SharedConcurrentTimeoutTimer timer, + const LocatedTriplesPerBlock& locatedTriplesPerBlock = + LocatedTriplesPerBlock{}) const; private: // Read the block that is identified by the `blockMetaData` from the `file`. @@ -295,7 +305,10 @@ class CompressedRelationReader { // must have at least `numRowsToRead + offsetInTable` rows. static void decompressBlockToExistingIdTable( const CompressedBlock& compressedBlock, size_t numRowsToRead, - IdTable& table, size_t offsetInTable); + IdTable& table, size_t offsetInTable, + std::pair numInsAndDel = {0, 0}, + const LocatedTriplesPerBlock& locatedTriplesPerBlock = {}, + size_t blockIndex = 0); // Helper function used by `decompressBlock` and // `decompressBlockToExistingIdTable`. Decompress the `compressedColumn` and @@ -305,6 +318,7 @@ class CompressedRelationReader { static void decompressColumn(const std::vector& compressedColumn, size_t numRowsToRead, Iterator iterator); + public: // Read the block that is identified by the `blockMetaData` from the `file`, // decompress and return it. // If `columnIndices` is `nullopt`, then all columns of the block are read, @@ -313,5 +327,3 @@ class CompressedRelationReader { const CompressedBlockMetadata& blockMetaData, ad_utility::File& file, std::optional> columnIndices); }; - -#endif // QLEVER_COMPRESSEDRELATION_H diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h index 71081578f2..22c2e9b1d1 100644 --- a/src/index/ConstantsIndexBuilding.h +++ b/src/index/ConstantsIndexBuilding.h @@ -84,7 +84,7 @@ constexpr size_t QUEUE_SIZE_AFTER_PARALLEL_PARSING = 10; // NOTE: This used to be `1 << 23` (over 8M), which is fairly large (we always // need to decompress at least one whole block, even when reading only few // triples). With 100K, the total space for all the `CompressedBlockMetadata` is -// still small compared to the rest of the index. However, with 100K, and single -// block is just 10K compresse, which might result in sub-optimal IO-efficiency +// still small compared to the rest of the index. However, with 100K, a single +// block is just 10K compressed, which might result in sub-optimal IO-efficiency // when reading many blocks. We take 500K as a compromise. constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 500'000; diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp new file mode 100644 index 0000000000..35da7f88dd --- /dev/null +++ b/src/index/DeltaTriples.cpp @@ -0,0 +1,172 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#include "index/DeltaTriples.h" + +#include "absl/strings/str_cat.h" +#include "index/Index.h" +#include "index/IndexImpl.h" +#include "index/LocatedTriples.h" +#include "parser/TurtleParser.h" +#include "util/Timer.h" + +// ____________________________________________________________________________ +void DeltaTriples::clear() { + triplesInserted_.clear(); + triplesDeleted_.clear(); + locatedTriplesPerBlockInPSO_.clear(); + locatedTriplesPerBlockInPOS_.clear(); + locatedTriplesPerBlockInSPO_.clear(); + locatedTriplesPerBlockInSOP_.clear(); + locatedTriplesPerBlockInOSP_.clear(); + locatedTriplesPerBlockInOPS_.clear(); +} + +// ____________________________________________________________________________ +DeltaTriples::LocatedTripleHandles DeltaTriples::locateTripleInAllPermutations( + const IdTriple& idTriple) { + auto [s, p, o] = idTriple; + LocatedTripleHandles handles; + handles.forPSO = + locatedTriplesPerBlockInPSO_.add(LocatedTriple::locateTripleInPermutation( + p, s, o, index_.getImpl().PSO())); + handles.forPOS = + locatedTriplesPerBlockInPOS_.add(LocatedTriple::locateTripleInPermutation( + p, o, s, index_.getImpl().POS())); + handles.forSPO = + locatedTriplesPerBlockInSPO_.add(LocatedTriple::locateTripleInPermutation( + s, p, o, index_.getImpl().SPO())); + handles.forSOP = + locatedTriplesPerBlockInSOP_.add(LocatedTriple::locateTripleInPermutation( + s, o, p, index_.getImpl().SOP())); + handles.forOSP = + locatedTriplesPerBlockInOSP_.add(LocatedTriple::locateTripleInPermutation( + o, s, p, index_.getImpl().OSP())); + handles.forOPS = + locatedTriplesPerBlockInOPS_.add(LocatedTriple::locateTripleInPermutation( + o, p, s, index_.getImpl().OPS())); + return handles; +} + +// ____________________________________________________________________________ +void DeltaTriples::eraseTripleInAllPermutations( + DeltaTriples::LocatedTripleHandles& handles) { + // Helper lambda for erasing for one particular permutation. + auto erase = [](LocatedTriples::iterator locatedTriple, + LocatedTriplesPerBlock& locatedTriplesPerBlock) { + size_t blockIndex = locatedTriple->blockIndex; + locatedTriplesPerBlock.map_[blockIndex].erase(locatedTriple); + }; + // Now erase for all permutations. + erase(handles.forPSO, locatedTriplesPerBlockInPSO_); + erase(handles.forPOS, locatedTriplesPerBlockInPOS_); + erase(handles.forSPO, locatedTriplesPerBlockInSPO_); + erase(handles.forSOP, locatedTriplesPerBlockInSOP_); + erase(handles.forOSP, locatedTriplesPerBlockInOSP_); + erase(handles.forOPS, locatedTriplesPerBlockInOPS_); +}; + +// ____________________________________________________________________________ +void DeltaTriples::insertTriple(TurtleTriple turtleTriple) { + IdTriple idTriple = getIdTriple(turtleTriple); + // Inserting a triple (that does not exist in the original index) a second + // time has no effect. + // + // TODO: Test this behavior. + if (triplesInserted_.contains(idTriple)) { + throw std::runtime_error(absl::StrCat( + "Triple \"", turtleTriple.toString(), "\" was already inserted before", + ", this insertion therefore has no effect")); + } + // When re-inserting a previously deleted triple, we need to remove the triple + // from `triplesDeleted_` AND remove it from all + // `locatedTriplesPerBlock` (one per permutation) as well. + if (triplesDeleted_.contains(idTriple)) { + eraseTripleInAllPermutations(triplesDeleted_.at(idTriple)); + triplesDeleted_.erase(idTriple); + return; + } + // Locate the triple in one of the permutations (it does not matter which one) + // to check if it already exists in the index. If it already exists, the + // insertion is invalid, otherwise insert it. + LocatedTriple locatedTriple = LocatedTriple::locateTripleInPermutation( + idTriple[1], idTriple[0], idTriple[2], index_.getImpl().PSO()); + if (locatedTriple.existsInIndex) { + throw std::runtime_error( + absl::StrCat("Triple \"", turtleTriple.toString(), + "\" already exists in the original index", + ", this insertion therefore has no effect")); + } + auto iterators = locateTripleInAllPermutations(idTriple); + triplesInserted_.insert({idTriple, iterators}); +} + +// ____________________________________________________________________________ +void DeltaTriples::deleteTriple(TurtleTriple turtleTriple) { + IdTriple idTriple = getIdTriple(turtleTriple); + // Deleting a triple (that does exist in the original index) a second time has + // no effect. + // + // TODO: Test this behavior. + if (triplesDeleted_.contains(idTriple)) { + throw std::runtime_error(absl::StrCat( + "Triple \"", turtleTriple.toString(), "\" was already deleted before", + ", this deletion therefore has no effect")); + } + // When deleting a previously inserted triple (that did not exist in the index + // before), we need to remove the triple from `triplesInserted_` AND remove it + // from all `locatedTriplesPerBlock` (one per permutation) as well. + if (triplesInserted_.contains(idTriple)) { + eraseTripleInAllPermutations(triplesInserted_.at(idTriple)); + triplesInserted_.erase(idTriple); + return; + } + // Locate the triple in one of the permutations (it does not matter which one) + // to check if it actually exists in the index. If it does not exist, the + // deletion is invalid, otherwise add as deleted triple. + LocatedTriple locatedTriple = LocatedTriple::locateTripleInPermutation( + idTriple[1], idTriple[0], idTriple[2], index_.getImpl().PSO()); + if (!locatedTriple.existsInIndex) { + throw std::runtime_error( + absl::StrCat("Triple \"", turtleTriple.toString(), + "\" does not exist in the original index", + ", the deletion has no effect")); + } + auto iterators = locateTripleInAllPermutations(idTriple); + triplesDeleted_.insert({idTriple, iterators}); +} + +// ____________________________________________________________________________ +const LocatedTriplesPerBlock& DeltaTriples::getTriplesWithPositionsPerBlock( + Permutation::Enum permutation) const { + // TODO: This `switch` would no longer be needed if the six + // locatedTriplesPerBlockIn... were a map with the permutation as key. + switch (permutation) { + case Permutation::PSO: + return locatedTriplesPerBlockInPSO_; + case Permutation::POS: + return locatedTriplesPerBlockInPOS_; + case Permutation::SPO: + return locatedTriplesPerBlockInSPO_; + case Permutation::SOP: + return locatedTriplesPerBlockInSOP_; + case Permutation::OSP: + return locatedTriplesPerBlockInOSP_; + case Permutation::OPS: + return locatedTriplesPerBlockInOPS_; + default: + AD_FAIL(); + } +} + +// ____________________________________________________________________________ +IdTriple DeltaTriples::getIdTriple(const TurtleTriple& turtleTriple) { + TripleComponent subject = std::move(turtleTriple._subject); + TripleComponent predicate = std::move(turtleTriple._predicate); + TripleComponent object = std::move(turtleTriple._object); + Id subjectId = std::move(subject).toValueId(index_.getVocab(), localVocab_); + Id predId = std::move(predicate).toValueId(index_.getVocab(), localVocab_); + Id objectId = std::move(object).toValueId(index_.getVocab(), localVocab_); + return IdTriple{subjectId, predId, objectId}; +} diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h new file mode 100644 index 0000000000..715a42d0d8 --- /dev/null +++ b/src/index/DeltaTriples.h @@ -0,0 +1,142 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#pragma once + +#include "engine/LocalVocab.h" +#include "global/IdTriple.h" +#include "index/Index.h" +#include "index/IndexBuilderTypes.h" +#include "index/LocatedTriples.h" +#include "index/Permutations.h" +#include "parser/TurtleParser.h" +#include "util/HashSet.h" + +// A class for maintaining triples that are inserted or deleted after index +// building, we call these delta triples. How it works in principle: +// +// 1. For each delta triple, find the location in each permutation (block index +// and index within that block, see `index/LocatedTriples.h`). +// +// 2. For each permutation and each block, store a sorted list of the positions +// of the delta triples within that block. +// +// 3. In the call of `PermutationImpl::scan`, use the respective lists to merge +// the relevant delta tripless into the index scan result. +// +// NOTE: The delta triples currently do not go well together with CACHING. See +// the discussion at the end of this file. +class DeltaTriples { + private: + // The index to which these triples are added. + const Index& index_; + + // The local vocabulary of the delta triples (they may have components, + // which are not contained in the vocabulary of the original index). + LocalVocab localVocab_; + + // The positions of the delta triples in each of the six permutations. + LocatedTriplesPerBlock locatedTriplesPerBlockInPSO_; + LocatedTriplesPerBlock locatedTriplesPerBlockInPOS_; + LocatedTriplesPerBlock locatedTriplesPerBlockInSPO_; + LocatedTriplesPerBlock locatedTriplesPerBlockInSOP_; + LocatedTriplesPerBlock locatedTriplesPerBlockInOSP_; + LocatedTriplesPerBlock locatedTriplesPerBlockInOPS_; + + // Each delta triple needs to know where it is stored in each of the six + // `LocatedTriplesPerBlock` above. + struct LocatedTripleHandles { + LocatedTriples::iterator forPSO; + LocatedTriples::iterator forPOS; + LocatedTriples::iterator forSPO; + LocatedTriples::iterator forSOP; + LocatedTriples::iterator forOPS; + LocatedTriples::iterator forOSP; + }; + + // The sets of triples added to and subtracted from the original index + // + // NOTE: The methods `insertTriple` and `deleteTriple` make sure that only + // triples are added that are not already contained in the original index and + // that only triples are subtracted that are contained in the original index. + // In particular, no triple can be in both of these sets. + ad_utility::HashMap triplesInserted_; + ad_utility::HashMap triplesDeleted_; + + public: + // Construct for given index. + DeltaTriples(const Index& index) : index_(index) {} + + // Get the `Index` to which these delta triples refer. + const Index& getIndex() const { return index_; } + + // Get the common `LocalVocab` of the delta triples. + LocalVocab& localVocab() { return localVocab_; } + const LocalVocab& localVocab() const { return localVocab_; } + + // Clear `_triplesAdded` and `_triplesSubtracted` and all associated data + // structures. + void clear(); + + // The number of delta triples added and subtracted. + size_t numInserted() const { return triplesInserted_.size(); } + size_t numDeleted() const { return triplesDeleted_.size(); } + + // Insert triple. + void insertTriple(TurtleTriple turtleTriple); + + // Delete triple. + void deleteTriple(TurtleTriple turtleTriple); + + // Get `TripleWithPosition` objects for given permutation. + const LocatedTriplesPerBlock& getTriplesWithPositionsPerBlock( + Permutation::Enum permutation) const; + + // TODO: made public as long as we are trying to figure out how this works. + private: + public: + // Get triples of `Id`s from `TurtleTriple` (which is the kind of triple we + // get from `TurtleParser`, see the code currently handling insertions and + // deletions in `Server.cpp`). + // + // NOTE: This is not `const` because translating to IDs may augment the local + // vocabulary. + IdTriple getIdTriple(const TurtleTriple& turtleTriple); + + // Find the position of the given triple in the given permutation and add it + // to each of the six `LocatedTriplesPerBlock` maps (one per permutation). + // Return the iterators of where it was added (so that we can easily delete it + // again from these maps later). + // + // TODO: The function is name is misleading, since this method does not only + // locate, but also add to the mentioned data structures. + LocatedTripleHandles locateTripleInAllPermutations(const IdTriple& idTriple); + + // Erase `LocatedTriple` object from each `LocatedTriplesPerBlock` list. The + // argument are iterators for each list, as returned by the method + // `locateTripleInAllPermutations` above. + // + // NOTE: The iterators are invalid afterwards. That is OK, as long as we also + // delete the respective entry in `triplesInserted_` or `triplesDeleted_`, + // which stores these iterators. + void eraseTripleInAllPermutations(LocatedTripleHandles& handles); +}; + +// DELTA TRIPLES AND THE CACHE +// +// For now, our approach only works when the results of index scans are not +// cached (unless there are no relevant delta triples for a particular scan). +// There are two ways how this can play out in the future: +// +// Either we generally do not cache the results of index scans anymore. This +// would have various advantages, in particular, joining with something like +// `rdf:type` would then be possible without storing the whole relation in +// RAM. However, we need a faster decompression then and maybe a smaller block +// size (currently 8 MB). +// +// Or we add the delta triples when iterating over the cached (uncompressed) +// result from the index scan. In that case, we would need to (in Step 1 above) +// store and maintain the positions in those uncompressed index scans. However, +// this would only work for the results of index scans. For the results of more +// complex subqueries, it's hard to figure out which delta triples are relevant. diff --git a/src/index/Index.cpp b/src/index/Index.cpp index 7d41eb8fdf..4925d1c936 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -8,8 +8,10 @@ #include "./IndexImpl.h" -// ____________________________________________________________ -Index::Index() : pimpl_{std::make_unique()} {} +// _____________________________________________________________________________ +Index::Index() + : deltaTriples_{std::make_unique(*this)}, + pimpl_{std::make_unique(std::move(deltaTriples_))} {} Index::Index(Index&&) noexcept = default; // Needs to be in the .cpp file because of the unique_ptr to a forwarded class. @@ -17,6 +19,18 @@ Index::Index(Index&&) noexcept = default; // https://stackoverflow.com/questions/13414652/forward-declaration-with-unique-ptr Index::~Index() = default; +// _____________________________________________________________________________ +IndexImpl& Index::getImpl() { return *pimpl_; } +[[nodiscard]] const IndexImpl& Index::getImpl() const { return *pimpl_; } + +// _____________________________________________________________________________ +[[nodiscard]] DeltaTriples& Index::deltaTriples() { + return pimpl_->deltaTriples(); +} +[[nodiscard]] const DeltaTriples& Index::deltaTriples() const { + return pimpl_->deltaTriples(); +} + // ___________________________________________________________ template void Index::createFromFile(const std::string& filename) { diff --git a/src/index/Index.h b/src/index/Index.h index a071c7b730..7a884492f2 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -3,6 +3,7 @@ // Author: // 2014-2017 Björn Buchhold (buchhold@informatik.uni-freiburg.de) // 2018- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de) + #pragma once #include @@ -21,6 +22,7 @@ class IdTable; class TextBlockMetaData; class IndexImpl; +class DeltaTriples; /** * Used as a template argument to the `createFromFile` method, when we do not @@ -31,6 +33,18 @@ class TurtleParserAuto {}; class Index { private: + // A unique `DeltaTriples` object will be created when this `Index` object is + // constructed and then immediately passed on to the `IndexImpl` (see the next + // member). This order is important because the `DeltaTriples` object needs to + // know the `Index` to which it pertains, and the `IndexImpl` needs access to + // the `DeltaTriples` when scanning permutations. + // + // NOTE: The `Index` and `IndexImpl` class could also share a pointer to the + // same `DeltaTriples` object, but it seems more correct to have a unique + // pointer, which only `IndexImpl` owns. Note how the `deltaTriples` getter + // below accesses the object via `pimpl_`. + std::unique_ptr deltaTriples_; + // Pimpl to reduce compile times. std::unique_ptr pimpl_; @@ -73,6 +87,9 @@ class Index { // Get underlying access to the Pimpl where necessary. const IndexImpl& getPimpl() const { return *pimpl_; } + // Use delta triples (the default is not to use them). + void useDeltaTriples(); + // Create an index from a file. Will write vocabulary and on-disk index data. // NOTE: The index can not directly be used after this call, but has to be // setup by `createFromOnDiskIndex` after this call. @@ -314,6 +331,10 @@ class Index { // Get access to the implementation. This should be used rarely as it // requires including the rather expensive `IndexImpl.h` header - IndexImpl& getImpl() { return *pimpl_; } - [[nodiscard]] const IndexImpl& getImpl() const { return *pimpl_; } + IndexImpl& getImpl(); + [[nodiscard]] const IndexImpl& getImpl() const; + + // Get acces to the delta triples. + [[nodiscard]] DeltaTriples& deltaTriples(); + [[nodiscard]] const DeltaTriples& deltaTriples() const; }; diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index aaff5e8d56..afa5b9e4ff 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -30,7 +30,8 @@ using std::array; // _____________________________________________________________________________ -IndexImpl::IndexImpl() = default; +IndexImpl::IndexImpl(std::unique_ptr deltaTriples) + : deltaTriples_(std::move(deltaTriples)) {} // _____________________________________________________________________________ template diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index a920579302..fe0e998b56 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -5,33 +5,6 @@ // 2018- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de) #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - #include #include #include @@ -42,6 +15,34 @@ #include #include +#include "engine/ResultTable.h" +#include "global/Pattern.h" +#include "index/CompressedRelation.h" +#include "index/ConstantsIndexBuilding.h" +#include "index/DeltaTriples.h" +#include "index/DocsDB.h" +#include "index/Index.h" +#include "index/IndexBuilderTypes.h" +#include "index/IndexMetaData.h" +#include "index/PatternCreator.h" +#include "index/Permutations.h" +#include "index/StxxlSortFunctors.h" +#include "index/TextMetaData.h" +#include "index/Vocabulary.h" +#include "index/VocabularyGenerator.h" +#include "parser/ContextFileParser.h" +#include "parser/TripleComponent.h" +#include "parser/TurtleParser.h" +#include "util/BackgroundStxxlSorter.h" +#include "util/BufferedVector.h" +#include "util/CompressionUsingZstd/ZstdWrapper.h" +#include "util/File.h" +#include "util/Forward.h" +#include "util/HashMap.h" +#include "util/MmapVector.h" +#include "util/Timer.h" +#include "util/json.h" + using ad_utility::BufferedVector; using ad_utility::MmapVector; using ad_utility::MmapVectorView; @@ -126,6 +127,10 @@ class IndexImpl { off_t currenttOffset_; mutable ad_utility::File textIndexFile_; + // Reference to the delta triples from the `Index` class of which this class + // is the implementation. + std::unique_ptr deltaTriples_; + // If false, only PSO and POS permutations are loaded and expected. bool loadAllPermutations_ = true; @@ -160,15 +165,40 @@ class IndexImpl { // TODO: make those private and allow only const access // instantiations for the six permutations used in QLever. // They simplify the creation of permutations in the index class. - Permutation pos_{"POS", ".pos", {1, 2, 0}}; - Permutation pso_{"PSO", ".pso", {1, 0, 2}}; - Permutation sop_{"SOP", ".sop", {0, 2, 1}}; - Permutation spo_{"SPO", ".spo", {0, 1, 2}}; - Permutation ops_{"OPS", ".ops", {2, 1, 0}}; - Permutation osp_{"OSP", ".osp", {2, 0, 1}}; + Permutation pos_{ + "POS", + ".pos", + {1, 2, 0}, + deltaTriples_->getTriplesWithPositionsPerBlock(Permutation::POS)}; + Permutation pso_{ + "PSO", + ".pso", + {1, 0, 2}, + deltaTriples_->getTriplesWithPositionsPerBlock(Permutation::PSO)}; + Permutation sop_{ + "SOP", + ".sop", + {0, 2, 1}, + deltaTriples_->getTriplesWithPositionsPerBlock(Permutation::SOP)}; + Permutation spo_{ + "SPO", + ".spo", + {0, 1, 2}, + deltaTriples_->getTriplesWithPositionsPerBlock(Permutation::SPO)}; + Permutation ops_{ + "OPS", + ".ops", + {2, 1, 0}, + deltaTriples_->getTriplesWithPositionsPerBlock(Permutation::OPS)}; + Permutation osp_{ + "OSP", + ".osp", + {2, 0, 1}, + deltaTriples_->getTriplesWithPositionsPerBlock(Permutation::OSP)}; public: - IndexImpl(); + IndexImpl(std::unique_ptr deltaTriples = + std::unique_ptr()); // Forbid copying. IndexImpl& operator=(const IndexImpl&) = delete; @@ -196,6 +226,9 @@ class IndexImpl { Permutation& getPermutation(Permutation::Enum p); const Permutation& getPermutation(Permutation::Enum p) const; + const DeltaTriples& deltaTriples() const { return *deltaTriples_; } + DeltaTriples& deltaTriples() { return *deltaTriples_; } + // Creates an index from a file. Parameter Parser must be able to split the // file's format into triples. // Will write vocabulary and on-disk index data. diff --git a/src/index/IndexMetaData.h b/src/index/IndexMetaData.h index 4e3ef4b38f..3039c0ba28 100644 --- a/src/index/IndexMetaData.h +++ b/src/index/IndexMetaData.h @@ -1,6 +1,7 @@ // Copyright 2015, University of Freiburg, // Chair of Algorithms and Data Structures. // Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) + #pragma once #include @@ -13,14 +14,14 @@ #include #include -#include "../global/Id.h" -#include "../util/File.h" -#include "../util/HashMap.h" -#include "../util/MmapVector.h" -#include "../util/ReadableNumberFact.h" -#include "../util/Serializer/Serializer.h" -#include "./MetaDataHandler.h" -#include "CompressedRelation.h" +#include "global/Id.h" +#include "index/CompressedRelation.h" +#include "index/MetaDataHandler.h" +#include "util/File.h" +#include "util/HashMap.h" +#include "util/MmapVector.h" +#include "util/ReadableNumberFact.h" +#include "util/Serializer/Serializer.h" using std::array; using std::pair; @@ -86,7 +87,10 @@ class IndexMetaData { // name and the variable name are terrible. // For each relation, its meta data. + public: MapType _data; + + private: // For each compressed block, its meta data. BlocksType _blockData; diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp new file mode 100644 index 0000000000..acd6988675 --- /dev/null +++ b/src/index/LocatedTriples.cpp @@ -0,0 +1,349 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#include "index/LocatedTriples.h" + +#include + +#include "index/CompressedRelation.h" +#include "index/IndexMetaData.h" +#include "index/Permutations.h" + +// ____________________________________________________________________________ +LocatedTriple LocatedTriple::locateTripleInPermutation( + Id id1, Id id2, Id id3, const Permutation& permutation) { + // Get the internal data structures from the permutation. + auto& file = permutation._file; + const auto& meta = permutation._meta; + const auto& reader = permutation._reader; + + // Find the index of the first block where the last triple is not smaller. + // + // NOTE: Since `_col2LastId` has been added to `CompressedBlockMetadata`, this + // can be computed without having to decompress any blocks. + const vector& blocks = meta.blockData(); + auto matchingBlock = std::lower_bound( + blocks.begin(), blocks.end(), std::array{id1, id2, id3}, + [&](const CompressedBlockMetadata& block, const auto& triple) -> bool { + if (block.col0LastId_ < triple[0]) { + return true; + } else if (block.col0LastId_ == triple[0]) { + if (block.col1LastId_ < triple[1]) { + return true; + } else if (block.col1LastId_ == triple[1]) { + return block.col2LastId_ < triple[2]; + } + } + return false; + }); + size_t blockIndex = matchingBlock - blocks.begin(); + + // Preliminary `FindTripleResult` object with the correct `blockIndex` and + // `Id`s, and a special `rowIndexInBlock` (see below) and `existsInIndex` set + // to `false`. + LocatedTriple locatedTriple{blockIndex, NO_ROW_INDEX, id1, id2, id3, false}; + + // If all `Id`s from all blocks are smaller, we return the index of the last + // block plus one (typical "end" semantics) and the special row index + // `NO_ROW_INDEX` (see how this is considered in `mergeTriples`). + if (matchingBlock == blocks.end()) { + AD_CORRECTNESS_CHECK(blockIndex == blocks.size()); + return locatedTriple; + } + + // Read and decompress the block. + DecompressedBlock blockTuples = + reader.readAndDecompressBlock(*matchingBlock, file, std::nullopt); + + // Find the smallest relation `Id` that is not smaller than `id1` and get its + // metadata and the position of the first and last triple with that `Id` in + // the block. + // + // IMPORTANT: If relation `id1` exists in the index, but our triple is larger + // than all triples of that relation in the index and the last triple of that + // relation ends a block, then our block search above (correctly) landed us at + // the next block. We can detect this by checking whether the first relation + // `Id` of the block is larger than `id1` and then we should get the metadata + // for the `Id` and not for `id1` (which would pertain to a previous block). + // + // TODO: There is still a bug in `MetaDataWrapperHashMap::lower_bound`, + // which is relevant in the rare case where a triple is inserted with an + // `Id` for predicate that is not a new `Id`, but has not been used for a + // predicate in the original index. + // + // NOTE: Since we have already handled the case, where all `Id`s in the + // permutation are smaller, above, such a relation should exist. + Id searchId = + matchingBlock->col0FirstId_ > id1 ? matchingBlock->col0FirstId_ : id1; + const auto& it = meta._data.lower_bound(searchId); + AD_CORRECTNESS_CHECK(it != meta._data.end()); + Id id = it.getId(); + const auto& relationMetadata = meta.getMetaData(id); + size_t offsetBegin = relationMetadata.offsetInBlock_; + size_t offsetEnd = offsetBegin + relationMetadata.numRows_; + // Note: If the relation spans multiple blocks, we know that the block we + // found above contains only triples from that relation. + if (offsetBegin == std::numeric_limits::max()) { + offsetBegin = 0; + offsetEnd = blockTuples.size(); + } + AD_CORRECTNESS_CHECK(offsetBegin <= blockTuples.size()); + AD_CORRECTNESS_CHECK(offsetEnd <= blockTuples.size()); + + // If we have found `id1`, we can do a binary search in the portion of the + // block that pertains to it (note the special case mentioned above, where + // we are already at the beginning of the next block). + // + // Otherwise, `id` is the next larger `Id` and the position of the first + // triple of that relation is exactly the position we are looking for. + if (id == id1) { + locatedTriple.rowIndexInBlock = + std::lower_bound(blockTuples.begin() + offsetBegin, + blockTuples.begin() + offsetEnd, + std::array{id2, id3}, + [](const auto& a, const auto& b) { + return a[0] < b[0] || (a[0] == b[0] && a[1] < b[1]); + }) - + blockTuples.begin(); + // Check if the triple at the found position is equal to `id1 id2 id3`. + // Note that our default for `existsInIndex` was set to `false` above. + const size_t& i = locatedTriple.rowIndexInBlock; + AD_CORRECTNESS_CHECK(i < blockTuples.size()); + if (i < offsetEnd && blockTuples(i, 0) == id2 && blockTuples(i, 1) == id3) { + locatedTriple.existsInIndex = true; + } + } else { + AD_CORRECTNESS_CHECK(id1 < id); + locatedTriple.rowIndexInBlock = offsetBegin; + } + + // Return the result. + return locatedTriple; +} + +// ____________________________________________________________________________ +template +std::pair LocatedTriplesPerBlock::numTriplesImpl( + size_t blockIndex, Id id1, Id id2) const { + // If no located triples for `blockIndex` exist, there is no entry in `map_`. + if (!map_.contains(blockIndex)) { + return {0, 0}; + } + + // Otherwise iterate over all located triples and count how many of them exist + // in the index ("to be deleted") and how many are new ("to be inserted"). + size_t countExists = 0; + size_t countNew = 0; + for (const LocatedTriple& locatedTriple : map_.at(blockIndex)) { + // Helper lambda for increasing the right counter. + auto increaseCountIf = [&](bool increase) { + if (increase) { + if (locatedTriple.existsInIndex) { + ++countExists; + } else { + ++countNew; + } + } + }; + // Increase depending on the mode. + if constexpr (matchMode == MatchMode::MatchAll) { + increaseCountIf(true); + } else if constexpr (matchMode == MatchMode::MatchId1) { + increaseCountIf(locatedTriple.id1 == id1); + } else if constexpr (matchMode == MatchMode::MatchId1AndId2) { + increaseCountIf(locatedTriple.id1 == id1 && locatedTriple.id2 == id2); + } + } + return {countNew, countExists}; +} + +// ____________________________________________________________________________ +std::pair LocatedTriplesPerBlock::numTriples( + size_t blockIndex) const { + return numTriplesImpl(blockIndex); +} + +// ____________________________________________________________________________ +std::pair LocatedTriplesPerBlock::numTriples(size_t blockIndex, + Id id1) const { + return numTriplesImpl(blockIndex, id1); +} + +// ____________________________________________________________________________ +std::pair LocatedTriplesPerBlock::numTriples(size_t blockIndex, + Id id1, + Id id2) const { + return numTriplesImpl(blockIndex, id1, id2); +} + +// ____________________________________________________________________________ +template +size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + std::optional block, + IdTable& result, + size_t offsetInResult, Id id1, + Id id2, size_t rowIndexInBlockBegin, + size_t rowIndexInBlockEnd) const { + // This method should only be called if there are located triples in the + // specified block. + AD_CONTRACT_CHECK(map_.contains(blockIndex)); + + // The special case `block == std::nullopt` (write only located triples to + // `result`) is only allowed, when `id1` or `id1` and `id2` are specified. + AD_CONTRACT_CHECK(block.has_value() || matchMode != MatchMode::MatchAll); + + // If `rowIndexInBlockEnd` has the default value (see `LocatedTriples.h`), the + // intended semantics is that we read the whole block (note that we can't have + // a default value that depends on the values of previous arguments). + if (rowIndexInBlockEnd == LocatedTriple::NO_ROW_INDEX && block.has_value()) { + rowIndexInBlockEnd = block.value().size(); + } + + // Check that `rowIndexInBlockBegin` and `rowIndexInBlockEnd` define a valid + // and non-emtpy range and that it is a subrange of `block` (unless the latter + // is `std::nullopt`). + if (block.has_value()) { + AD_CONTRACT_CHECK(rowIndexInBlockBegin < block.value().size()); + AD_CONTRACT_CHECK(rowIndexInBlockEnd <= block.value().size()); + } + AD_CONTRACT_CHECK(rowIndexInBlockBegin < rowIndexInBlockEnd); + + // If we restrict `id1` and `id2`, the index block and the result must have + // one column (for the `id3`). Otherwise, they must have two columns (for the + // `id2` and the `id3`). + if constexpr (matchMode == MatchMode::MatchId1AndId2) { + AD_CONTRACT_CHECK(!block.has_value() || block.value().numColumns() == 1); + AD_CONTRACT_CHECK(result.numColumns() == 1); + } else { + AD_CONTRACT_CHECK(!block.has_value() || block.value().numColumns() == 2); + AD_CONTRACT_CHECK(result.numColumns() == 2); + } + + auto resultEntry = result.begin() + offsetInResult; + const auto& locatedTriples = map_.at(blockIndex); + auto locatedTriple = locatedTriples.begin(); + + // Helper lambda that checks whether the given located triple should be + // considered, given the `matchMode`. + auto locatedTripleMatches = [&]() { + if constexpr (matchMode == MatchMode::MatchAll) { + return true; + } else if constexpr (matchMode == MatchMode::MatchId1) { + return locatedTriple->id1 == id1; + } else if constexpr (matchMode == MatchMode::MatchId1AndId2) { + return locatedTriple->id1 == id1 && locatedTriple->id2 == id2; + } + }; + + // Advance to the first located triple in the specified range. + while (locatedTriple != locatedTriples.end() && + locatedTriple->rowIndexInBlock < rowIndexInBlockBegin) { + ++locatedTriple; + } + + // Iterate over all located triples in the specified range. In the special + // case `block == std::nullopt` (only write located triples to `result`), all + // relevant located triples have `rowIndexInBlock == NO_ROW_INDEX` (here we + // need that `NO_ROW_INDEX` is the maximal `size_t` value minus one). + if (!block.has_value()) { + rowIndexInBlockBegin = LocatedTriple::NO_ROW_INDEX; + rowIndexInBlockEnd = rowIndexInBlockBegin + 1; + AD_CORRECTNESS_CHECK(rowIndexInBlockBegin < rowIndexInBlockEnd); + } + for (size_t rowIndex = rowIndexInBlockBegin; rowIndex < rowIndexInBlockEnd; + ++rowIndex) { + // Append triples that are marked for insertion at this `rowIndex` to the + // result. + while (locatedTriple != locatedTriples.end() && + locatedTriple->rowIndexInBlock == rowIndex && + locatedTriple->existsInIndex == false) { + if (locatedTripleMatches()) { + if constexpr (matchMode == MatchMode::MatchId1AndId2) { + (*resultEntry)[0] = locatedTriple->id3; + } else { + (*resultEntry)[0] = locatedTriple->id2; + (*resultEntry)[1] = locatedTriple->id3; + } + ++resultEntry; + } + ++locatedTriple; + } + + // Append the triple at this position to the result if and only if it is not + // marked for deletion and matches (also skip it if it does not match). + bool deleteThisEntry = false; + if (locatedTriple != locatedTriples.end() && + locatedTriple->rowIndexInBlock == rowIndex && + locatedTriple->existsInIndex == true) { + deleteThisEntry = locatedTripleMatches(); + ++locatedTriple; + } + if (block.has_value() && !deleteThisEntry) { + *resultEntry++ = block.value()[rowIndex]; + } + }; + + // Return the number of rows written to `result`. + return resultEntry - (result.begin() + offsetInResult); +} + +// ____________________________________________________________________________ +size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + std::optional block, + IdTable& result, + size_t offsetInResult) const { + return mergeTriples(blockIndex, std::move(block), result, + offsetInResult); +} + +// ____________________________________________________________________________ +size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + std::optional block, + IdTable& result, + size_t offsetInResult, Id id1, + size_t rowIndexInBlockBegin) const { + return mergeTriples( + blockIndex, std::move(block), result, offsetInResult, id1, + Id::makeUndefined(), rowIndexInBlockBegin); +} + +// ____________________________________________________________________________ +size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + std::optional block, + IdTable& result, + size_t offsetInResult, Id id1, + Id id2, size_t rowIndexInBlockBegin, + size_t rowIndexInBlockEnd) const { + return mergeTriples( + blockIndex, std::move(block), result, offsetInResult, id1, id2, + rowIndexInBlockBegin, rowIndexInBlockEnd); +} + +// ____________________________________________________________________________ +std::ostream& operator<<(std::ostream& os, const LocatedTriple& lt) { + os << "LT(" << lt.blockIndex << " " + << (lt.rowIndexInBlock == LocatedTriple::NO_ROW_INDEX + ? "NO_ROW_INDEX" + : std::to_string(lt.rowIndexInBlock)) + << " " << lt.id1 << " " << lt.id2 << " " << lt.id3 << " " + << lt.existsInIndex << ")"; + return os; +} + +// ____________________________________________________________________________ +std::ostream& operator<<(std::ostream& os, const LocatedTriples& lts) { + os << "{"; + std::copy(lts.begin(), lts.end(), + std::ostream_iterator(std::cout, " ")); + os << "}"; + return os; +} + +// ____________________________________________________________________________ +std::ostream& operator<<(std::ostream& os, const LocatedTriplesPerBlock& ltpb) { + for (auto [blockIndex, lts] : ltpb.map_) { + os << "Block #" << blockIndex << ": " << lts << std::endl; + } + return os; +} diff --git a/src/index/LocatedTriples.h b/src/index/LocatedTriples.h new file mode 100644 index 0000000000..bb967bfe95 --- /dev/null +++ b/src/index/LocatedTriples.h @@ -0,0 +1,196 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#pragma once + +#include "engine/idTable/IdTable.h" +#include "global/IdTriple.h" +#include "util/HashMap.h" + +class Permutation; + +// A triple and its location in a particular permutation. +// +// If a triple is not contained in the permutation, the location is the location +// of the next larger triple (which may be in the next block or beyond the last +// block). For a detailed definition of all border cases, see the definition at +// the end of this file. +// +// NOTE: Technically, `blockIndex` and the `existsInIndex` are redundant in this +// record because they can be derived when the class is used. However, they are +// useful for testing, and for a small nuber of delta triples (think millions), +// space efficiency is not a significant issue for this class. +struct LocatedTriple { + // The index of the block and the location within that block, according to the + // definition above. + size_t blockIndex; + size_t rowIndexInBlock; + // The `Id`s of the triple in the order of the permutation. For example, + // for an object pertaining to the SPO permutation: `id1` is the subject, + // `id2` is the predicate, and `id3` is the object. + Id id1; + Id id2; + Id id3; + // Flag that is true if and only if the triple exists in the permutation. It + // is then equal to the triple at the position given by `blockIndex` and + // `rowIndexInBlock`. + bool existsInIndex; + + // Locate the given triple in the given permutation. + static LocatedTriple locateTripleInPermutation( + Id id1, Id id2, Id id3, const Permutation& permutation); + + // Special row index for triples that belong to the previous block (see the + // definition for the location of a triple at the end of this file). + // + // NOTE: It is important that `NO_ROW_INDEX + 1 > NO_ROW_INDEX`, hence it is + // defined as `max() - 1` and not as the seemingly more natural `max()`. + static const size_t NO_ROW_INDEX = std::numeric_limits::max() - 1; +}; + +// A sorted set of located triples. In `LocatedTriplesPerBlock` below, we use +// this to store all located triples with the same `blockIndex`. +// +// NOTE: We could also overload `std::less` here, but the explicit specification +// of the order makes it clearer. +struct LocatedTripleCompare { + bool operator()(const LocatedTriple& x, const LocatedTriple& y) const { + return IdTriple{x.id1, x.id2, x.id3} < IdTriple{y.id1, y.id2, y.id3}; + } +}; +using LocatedTriples = std::set; + +// Sorted sets of located triples, grouped by block. We use this to store all +// located triples for a permutation. +class LocatedTriplesPerBlock { + private: + // The total number of `LocatedTriple` objects stored (for all blocks). + size_t numTriples_ = 0; + + public: + // For each block with a non-empty set of located triples, the located triples + // in that block. + // + // NOTE: This is currently not private because we want access to + // `map_.size()`, `map_.clear()`, `map_.contains(...)`, and `map_.at(...)`. + // We could also make `LocatedTriplesPerBlock` a subclass of `HashMap`, but not sure whether that is good style. + ad_utility::HashMap map_; + + public: + // Get the number of located triples for the given block that match `id1` (if + // provided) and `id2` (if provided). The return value is a pair of numbers: + // first, the number of existing triples ("to be deleted") and second, the + // number of new triples ("to be inserted"). + std::pair numTriples(size_t blockIndex) const; + std::pair numTriples(size_t blockIndex, Id id1) const; + std::pair numTriples(size_t blockIndex, Id id1, Id id2) const; + + // Merge located triples for `blockIndex` with the given index `block` and + // write to `result`, starting from position `offsetInResult`. Consider only + // located triples in the range specified by `rowIndexInBlockBegin` and + // `rowIndexInBlockEnd`. Consider only triples that match `id1` (if provided) + // and `id2` (if provided). Return the number of rows written to `result`. + // + // PRECONDITIONS: + // + // 1. The set of located triples for `blockIndex` must be non-empty. + // Otherwise, there is no need for merging and this method shouldn't be + // called for efficiency reasons. + // + // 2. It is the resposibility of the caller that there is enough space for the + // result of the merge in `result` starting from `offsetInResult`. + // + // 3. If `block == std::nullopt`, we are adding to `result` the located + // triples for block `blockIndex` where the `rowIndexInBlock` is + // `NO_ROW_INDEX`. These actually belong to the previous block, but were + // larger than all triples there. This requires that `id1` or both `id1` and + // `id2` are specified. + // + size_t mergeTriples(size_t blockIndex, std::optional block, + IdTable& result, size_t offsetInResult) const; + size_t mergeTriples(size_t blockIndex, std::optional block, + IdTable& result, size_t offsetInResult, Id id1, + size_t rowIndexInBlockBegin = 0) const; + size_t mergeTriples( + size_t blockIndex, std::optional block, IdTable& result, + size_t offsetInResult, Id id1, Id id2, size_t rowIndexInBlockBegin = 0, + size_t rowIndexInBlockEnd = LocatedTriple::NO_ROW_INDEX) const; + + // Add the given `locatedTriple` to the given `LocatedTriplesPerBlock`. + // Return a handle to where it was added (`LocatedTriples` is a sorted set, + // see above). We need this handle so that we can easily remove the + // `locatedTriple` again from the set in case we need to. + // + // The `locatedTriple` must not already exist in `LocatedTriplesPerBlock`. + LocatedTriples::iterator add(const LocatedTriple& locatedTriple) { + LocatedTriples& locatedTriples = map_[locatedTriple.blockIndex]; + auto [handle, wasInserted] = locatedTriples.emplace(locatedTriple); + AD_CORRECTNESS_CHECK(wasInserted == true); + AD_CORRECTNESS_CHECK(handle != locatedTriples.end()); + ++numTriples_; + return handle; + }; + + // Get the total number of `LocatedTriple` objects (for all blocks). + size_t numTriples() const { return numTriples_; } + + // Get the number of blocks with a non-empty set of located triples. + size_t numBlocks() const { return map_.size(); } + + // Remove all located triples. + void clear() { + map_.clear(); + numTriples_ = 0; + } + + private: + // Match modes for `numTriplesInBlockImpl` and `mergeTriplesIntoBlockImpl`. + enum struct MatchMode { MatchAll, MatchId1, MatchId1AndId2 }; + + // The Implementation behind the public method `numTriplesInBlock` above. + template + std::pair numTriplesImpl(size_t blockIndex, + Id id1 = Id::makeUndefined(), + Id id2 = Id::makeUndefined()) const; + + // The Implementation behind the public method `mergeTriplesIntoBlock` above. + // The only reason that the arguments `id1` and `id2` come at the end here is + // so that we can give them default values. + template + size_t mergeTriples( + size_t blockIndex, std::optional block, IdTable& result, + size_t offsetInResult, Id id1 = Id::makeUndefined(), + Id id2 = Id::makeUndefined(), size_t rowIndexInBlockBegin = 0, + size_t rowIndexInBlockEnd = LocatedTriple::NO_ROW_INDEX) const; +}; + +// Human-readable representation of `LocatedTriple`, `LocatedTriples`, and +// `LocatedTriplesPerBlock`, which are very useful for debugging. +std::ostream& operator<<(std::ostream& os, const LocatedTriple& lt); +std::ostream& operator<<(std::ostream& os, const LocatedTriples& lts); +std::ostream& operator<<(std::ostream& os, const LocatedTriplesPerBlock& ltpb); + +// DEFINITION OF THE POSITION OF A LOCATED TRIPLE IN A PERMUTATION +// +// 1. The position is defined by the index of a block in the permutation and the +// index of a row within that block. +// +// 2. If the triple in contained in the permutation, it is contained exactly +// once and so there is a well defined block and position in that block. +// +// 2. If there is a block, where the first triple is smaller and the last triple +// is larger, then that is the block and the position in that block is that of +// the first triple that is (not smaller and hence) larger. +// +// 3. If the triple falls "between two blocks" (the last triple of the previous +// block is smaller and the first triple of the next block is larger), then the +// position is the first position in that next block. +// +// 4. As a special case of 3, if the triple is smaller than all triples in the +// permutation, the position is the first position of the first block. +// +// 5. If the triple is larger than all triples in the permutation, the block +// index is one after the largest block index and the position within that +// non-existing block is arbitrary. diff --git a/src/index/MetaDataHandler.h b/src/index/MetaDataHandler.h index da84f1158a..e24e33fe5c 100644 --- a/src/index/MetaDataHandler.h +++ b/src/index/MetaDataHandler.h @@ -1,29 +1,39 @@ -// Copyright 2018, University of Freiburg, +// Copyright 2018 - 2023, University of Freiburg // Chair of Algorithms and Data Structures -// Author: Johannes Kalmbach (johannes.kalmbach@gmail.com) -// +// Authors: Johannes Kalmbach +// Hannah Bast + #pragma once #include #include -#include "../global/Id.h" -#include "../util/Exception.h" -#include "../util/HashMap.h" -#include "../util/Iterators.h" -#include "../util/Log.h" -#include "../util/Serializer/Serializer.h" -#include "./CompressedRelation.h" - -// _____________________________________________________________________ +#include "global/Id.h" +#include "index/CompressedRelation.h" +#include "util/Exception.h" +#include "util/HashMap.h" +#include "util/Iterators.h" +#include "util/Log.h" +#include "util/Serializer/Serializer.h" + +// Class for access to relation metadata stored in a vector. Specifically, our +// index uses this with `M = MmapVector>`; see +// `index/IndexMetaData.h` template class MetaDataWrapperDense { + private: + // A vector of metadata objects. + M _vec; + public: + // An iterator with an additional method `getId()` that gives the relation ID + // of the current metadata object. template struct AddGetIdIterator : BaseIterator { using BaseIterator::BaseIterator; AddGetIdIterator(BaseIterator base) : BaseIterator{base} {} [[nodiscard]] Id getId() const { return getIdFromElement(*(*this)); } + [[nodiscard]] const auto& getMetaData() const { return *(*this); } static Id getIdFromElement(const typename BaseIterator::value_type& v) { return v.col0Id_; } @@ -39,6 +49,7 @@ class MetaDataWrapperDense { // The underlying array is sorted, so all iterators are ordered iterators using ConstOrderedIterator = ConstIterator; + // The type of the stored metadata objects. using value_type = typename M::value_type; // _________________________________________________________ @@ -109,12 +120,24 @@ class MetaDataWrapperDense { // ___________________________________________________________ std::string getFilename() const { return _vec.getFilename(); } - private: + // The following used to be private (because they were only used as + // subroutines in the above), but we now need them in + // `DeltaTriples::findTripleResult`. ConstIterator lower_bound(Id id) const { auto cmp = [](const auto& metaData, Id id) { return metaData.col0Id_ < id; }; return std::lower_bound(_vec.begin(), _vec.end(), id, cmp); } - M _vec; + Iterator lower_bound(Id id) { + auto cmp = [](const auto& metaData, Id id) { + return metaData.col0Id_ < id; + }; + return std::lower_bound(_vec.begin(), _vec.end(), id, cmp); + } }; + +// ======= +// M _vec; +// }; +// >>>>>>> master diff --git a/src/index/Permutations.h b/src/index/Permutations.h index 63578d746e..e6319c2375 100644 --- a/src/index/Permutations.h +++ b/src/index/Permutations.h @@ -1,20 +1,30 @@ // Copyright 2018, University of Freiburg, // Chair of Algorithms and Data Structures. // Author: Johannes Kalmbach (johannes.kalmbach@gmail.com) + #pragma once #include #include #include "global/Constants.h" +// #include "index/DeltaTriples.h" #include "index/IndexMetaData.h" +#include "index/StxxlSortFunctors.h" #include "util/File.h" #include "util/Log.h" +using std::array; +using std::string; + // Helper class to store static properties of the different permutations to // avoid code duplication. The first template parameter is a search functor for // STXXL. class Permutation { + private: + // The delta triples and their positions in this permutation. + const LocatedTriplesPerBlock& locatedTriplesPerBlock_; + public: /// Identifiers for the six possible permutations. enum struct Enum { PSO, POS, SPO, SOP, OPS, OSP }; @@ -27,12 +37,16 @@ class Permutation { static constexpr auto OPS = Enum::OPS; static constexpr auto OSP = Enum::OSP; using MetaData = IndexMetaDataMmapView; - Permutation(string name, string suffix, array order) - : _readableName(std::move(name)), + Permutation(string name, string suffix, array order, + const LocatedTriplesPerBlock& locatedTriplesPerBlock) + : locatedTriplesPerBlock_(locatedTriplesPerBlock), + _readableName(std::move(name)), _fileSuffix(std::move(suffix)), _keyOrder(order) {} - // everything that has to be done when reading an index from disk + // Initialize this permutation based on its index file(s) on disk. For PSO and + // PSO, this is one file named `.index.pos` or `.index.pso`, respectively. For + // the other permutations, there is also a `.meta` file. void loadFromDisk(const std::string& onDiskBase) { if constexpr (MetaData::_isMmapBased) { _meta.setup(onDiskBase + ".index" + _fileSuffix + MMAP_FILE_SUFFIX, @@ -52,9 +66,9 @@ class Permutation { _isLoaded = true; } - /// For a given ID for the first column, retrieve all IDs of the second and - /// third column, and store them in `result`. This is just a thin wrapper - /// around `CompressedRelationMetaData::scan`. + // For a given relation `Id` (first column), retrieve all `Id`s of the second + // and third column, and store them in `result`. This is just a thin wrapper + // around the corresponding `CompressedRelationMetaData::scan`. template void scan(Id col0Id, IdTableImpl* result, ad_utility::SharedConcurrentTimeoutTimer timer = nullptr) const { @@ -65,23 +79,25 @@ class Permutation { if (!_meta.col0IdExists(col0Id)) { return; } - const auto& metaData = _meta.getMetaData(col0Id); - return _reader.scan(metaData, _meta.blockData(), _file, result, - std::move(timer)); + const auto& metadataForRelation = _meta.getMetaData(col0Id); + const auto& metadataForAllBlocks = _meta.blockData(); + return _reader.scan(metadataForRelation, metadataForAllBlocks, _file, + result, std::move(timer), locatedTriplesPerBlock_); } - /// For given IDs for the first and second column, retrieve all IDs of the - /// third column, and store them in `result`. This is just a thin wrapper - /// around `CompressedRelationMetaData::scan`. + + // For a given relation `Id` (first column) and `Id` for the second column, + // retrieve all `Id`s of the third column, and store them in `result`. Also + // just a wrapper around the corresponding `CompressedRelationMetaData::scan`. template void scan(Id col0Id, Id col1Id, IdTableImpl* result, ad_utility::SharedConcurrentTimeoutTimer timer = nullptr) const { if (!_meta.col0IdExists(col0Id)) { return; } - const auto& metaData = _meta.getMetaData(col0Id); - - return _reader.scan(metaData, col1Id, _meta.blockData(), _file, result, - timer); + const auto& metadataForRelation = _meta.getMetaData(col0Id); + const auto& metadataForAllBlocks = _meta.blockData(); + return _reader.scan(metadataForRelation, col1Id, metadataForAllBlocks, + _file, result, timer, locatedTriplesPerBlock_); } // _______________________________________________________ diff --git a/src/parser/TurtleParser.h b/src/parser/TurtleParser.h index 1af252d401..e0ecd868cb 100644 --- a/src/parser/TurtleParser.h +++ b/src/parser/TurtleParser.h @@ -42,6 +42,11 @@ struct TurtleTriple { TripleComponent _object; bool operator==(const TurtleTriple&) const = default; + + // Return a string representation (used for error message in `DeltaTriples`). + std::string toString() { + return absl::StrCat(_subject, " ", _predicate, " ", _object.toString()); + } }; inline std::string_view stripAngleBrackets(std::string_view input) { diff --git a/src/util/AllocatorWithLimit.h b/src/util/AllocatorWithLimit.h index d45f6499bd..16185faa2b 100644 --- a/src/util/AllocatorWithLimit.h +++ b/src/util/AllocatorWithLimit.h @@ -89,8 +89,8 @@ class AllocationMemoryLeftThreadsafe { }; } // namespace detail -// setup a shared Allocation state. For the usage see documentation of the -// Limited Allocator class +// Setup a shared Allocation state. For the usage see documentation of the +// `AllocatorWithLimit` class. inline detail::AllocationMemoryLeftThreadsafe makeAllocationMemoryLeftThreadsafeObject(size_t n) { return detail::AllocationMemoryLeftThreadsafe{std::make_shared< diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f94b54c063..30796df150 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -104,6 +104,10 @@ addLinkAndDiscoverTest(IndexMetaDataTest index) # TODO fix this addLinkAndDiscoverTestSerial(IndexTest index) +addLinkAndDiscoverTestSerial(DeltaTriplesTest index) + +addLinkAndDiscoverTestSerial(LocatedTriplesTest index) + addLinkAndDiscoverTest(FTSAlgorithmsTest index) addLinkAndDiscoverTest(EngineTest engine) diff --git a/test/CompressedRelationsTest.cpp b/test/CompressedRelationsTest.cpp index 64e40d80de..e53d0f9b07 100644 --- a/test/CompressedRelationsTest.cpp +++ b/test/CompressedRelationsTest.cpp @@ -1,6 +1,6 @@ -// Copyright 2023, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach #include @@ -103,8 +103,8 @@ void testCompressedRelations(const std::vector& inputs, ad_utility::File file{filename, "r"}; auto timer = std::make_shared( ad_utility::TimeoutTimer::unlimited()); - // Check the contents of the metadata. + // Check the contents of the metadata. CompressedRelationReader reader; for (size_t i = 0; i < metaData.size(); ++i) { const auto& m = metaData[i]; diff --git a/test/DeltaTriplesTest.cpp b/test/DeltaTriplesTest.cpp new file mode 100644 index 0000000000..b91d8a6dc1 --- /dev/null +++ b/test/DeltaTriplesTest.cpp @@ -0,0 +1,544 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Hannah Bast + +#include + +#include "./IndexTestHelpers.h" +#include "./util/GTestHelpers.h" +#include "absl/strings/str_split.h" +#include "engine/ExportQueryExecutionTrees.h" +#include "index/DeltaTriples.h" +#include "index/IndexImpl.h" +#include "index/Permutations.h" +#include "parser/TurtleParser.h" + +// Shortcuts to these full type names used frequently in the following. +// using IdTriple; +static const std::vector permutationEnums = { + Permutation::PSO, Permutation::POS, Permutation::SPO, + Permutation::SOP, Permutation::OPS, Permutation::OSP}; + +// Fixture that sets up a test index. +class DeltaTriplesTest : public ::testing::Test { + protected: + // The triples in our test index (as a `std::vector` so that we have easy + // access to each triple separately in the tests below). + static constexpr const char* testTurtle = + " . " + " . " + " . " + " . " + " . " + " . " + " . " + " . " + " . " + " . " + " . " + " . " + " . " + " "; + + // Query execution context with index for testing, see `IndexTestHelpers.h`. + QueryExecutionContext* testQec = ad_utility::testing::getQec(testTurtle); + + // The individual triples (useful for testing below). + // + // NOTE: It looks like it would make more send to define a static class member + // of type `std::vector` in the first place that contains the + // individual triples and then concatenate them with `absl::StrJoin` for + // `getQec`, but C++ doesn't allow non-literal static class members. + std::vector getTestTriples() { + return absl::StrSplit(testTurtle, " . "); + } + + // Make `TurtleTriple` from given Turtle input. + TurtleTriple makeTurtleTriple(std::string_view turtle) { + TurtleStringParser parser; + parser.parseUtf8String(std::string{turtle}); + AD_CONTRACT_CHECK(parser.getTriples().size() == 1); + return parser.getTriples()[0]; + } + + // Make `IdTriple` from given Turtle input (the first argument is not `const` + // because we might change the local vocabulary). + IdTriple makeIdTriple(DeltaTriples& deltaTriples, std::string turtle) { + return deltaTriples.getIdTriple(makeTurtleTriple(std::move(turtle))); + } + + // Resolve the name for the given `Id` using the `Index` and `LocalVocab` from + // the given `deltaTriples` object. + std::string getNameForId(Id id, const DeltaTriples& deltaTriples) const { + auto lookupResult = ExportQueryExecutionTrees::idToStringAndType( + deltaTriples.getIndex(), id, deltaTriples.localVocab()); + AD_CONTRACT_CHECK(lookupResult.has_value()); + const auto& [value, type] = lookupResult.value(); + // std::ostringstream os; + // os << "[" << id << "]"; + return type ? absl::StrCat("\"", value, "\"^^<", type, ">") : value; + // : absl::StrCat(value, " ", os.str()); + }; + + // Get human-readable names for the given `permutation` and `idTriple`. This + // is needed for proper message when an assert fails in the tests below. The + // `idTriple` is assumed to be already in the right permutation (for example, + // for POS, `idTriple[0]` is the `Id` of the predicate). + template + std::pair getNicePermutationAndTripleName( + const DeltaTriples& deltaTriples, const Permutation& permutation, + const IdTriple idTriple) { + auto& namePermutation = permutation._readableName; + std::string nameId1 = getNameForId(idTriple[0], deltaTriples); + std::string nameId2 = getNameForId(idTriple[1], deltaTriples); + std::string nameId3 = getNameForId(idTriple[2], deltaTriples); + std::string nameTriple = + absl::StrCat(std::string{namePermutation[0]}, "=", nameId1, " ", + std::string{namePermutation[1]}, "=", nameId2, " ", + std::string{namePermutation[2]}, "=", nameId3); + return {namePermutation, nameTriple}; + } + + // Check that all six `triplesWithPositionsPerBlock` lists have the given + // number of `LocatedTriple` objects. + void checkTriplesWithPositionsPerBlockSize(const DeltaTriples& deltaTriples, + size_t expectedSize) { + for (Permutation::Enum permutation : permutationEnums) { + ASSERT_EQ(deltaTriples.getTriplesWithPositionsPerBlock(permutation) + .numTriples(), + expectedSize); + } + } + + // Get the complete sequence of "relation" (most significant) `Id`s for the + // given permutation. The result is a `std::vector` of `std::vector`, + // where the index into the outer vector is a block index, and each inner + // vector is as large as the corresponding block. + // + // NOTE: To save index storage space, these `Id`s are not stored explicitly in + // the blocks, but implicitly in the `CompressedRelationMetadata` objects of a + // permutation. For our test of `locateTripleInAllPermutations` below, we need + // random access to these `Id`s. + template + std::vector> getAllRelationIdsForPermutation( + const Permutation& permutation) { + // The metadata for each block (since our blocks are large, this is not a + // lot of data). + const std::vector& metadataPerBlock = + permutation._meta.blockData(); + + // Make room for the `Id`s in our final result: one `std::vector`` per + // block, and each of these is as large as the respective block. + std::vector> result(metadataPerBlock.size()); + for (size_t i = 0; i < result.size(); ++i) { + result[i].resize(metadataPerBlock[i].numRows_, Id::makeUndefined()); + } + + // Iterate over all relations. + // + // NOTE: The metadata per "relation" is stored as a hash map for POS and PSO + // (where there are typically few distinct "relations", that is, + // predicates), and as a vector for the other four permutations (there are + // typically many distinct subjects and objects). Whatever the type, we can + // always iterate over the complete set, see `MetaDataHandler.h`. + const auto& metadataPerRelation = permutation._meta._data; + for (auto it = metadataPerRelation.begin(); it != metadataPerRelation.end(); + ++it) { + // Get the `Id` of this relation, and where it starts in its (at this + // unknown) block, and how many triples it has overall. + const CompressedRelationMetadata& relationMetadata = it.getMetaData(); + Id relationId = relationMetadata.col0Id_; + size_t offsetInBlock = relationMetadata.offsetInBlock_; + size_t numTriples = relationMetadata.numRows_; + + // Find the index of the first block that contains triples from this + // relation. + const auto block = std::lower_bound( + metadataPerBlock.begin(), metadataPerBlock.end(), relationId, + [&](const CompressedBlockMetadata& block, const Id& id) -> bool { + return block.col0LastId_ < id; + }); + size_t blockIndex = block - metadataPerBlock.begin(); + AD_CORRECTNESS_CHECK(blockIndex < metadataPerBlock.size()); + AD_CORRECTNESS_CHECK(block->col0FirstId_ <= relationId); + AD_CORRECTNESS_CHECK(block->col0LastId_ >= relationId); + + // If the relation fits into a single block, we need to write the relation + // `Id` only in one block of our result. Otherwise, we have a sequence of + // blocks for only that relation `Id`. + if (offsetInBlock != std::numeric_limits::max()) { + AD_CORRECTNESS_CHECK(offsetInBlock + numTriples <= block->numRows_); + for (size_t i = offsetInBlock; i < offsetInBlock + numTriples; ++i) { + result[blockIndex][i] = relationId; + } + } else { + size_t count = 0; + while (blockIndex < metadataPerBlock.size() && + metadataPerBlock[blockIndex].col0FirstId_ == relationId) { + const auto& block = metadataPerBlock[blockIndex]; + AD_CORRECTNESS_CHECK(block.col0LastId_ == relationId); + for (size_t i = 0; i < block.numRows_; ++i) { + result[blockIndex][i] = relationId; + } + ++blockIndex; + count += block.numRows_; + } + AD_CORRECTNESS_CHECK(count == numTriples); + } + } + + // Check that all slots in `result` have been written and then return it. + for (const auto& resultBlock : result) { + for (const Id& id : resultBlock) { + AD_CORRECTNESS_CHECK(id != Id::makeUndefined()); + } + } + return result; + } +}; + +// Print relation `Id`s for selected permutation (for debugging only). +TEST_F(DeltaTriplesTest, showAllRelationIdsForPermutation) { + bool runThisTest = false; + if (runThisTest) { + // Compute relation `Id`s for POS (choose another premutation if you wish). + const Index& index = testQec->getIndex(); + DeltaTriples deltaTriples(index); + const auto& permutation = index.getImpl().POS(); + const std::vector> allRelationIdsForPermutation = + getAllRelationIdsForPermutation(permutation); + + // Show them per block. + std::cout << endl; + std::cout << "All relation IDs for permutation " + << permutation._readableName << ":" << std::endl; + size_t blockCount = 0; + for (const auto& block : allRelationIdsForPermutation) { + std::cout << "Block #" << (++blockCount) << ":"; + for (const Id& id : block) { + std::cout << " " + << (id != Id::makeUndefined() ? getNameForId(id, deltaTriples) + : "UNDEF") + << std::flush; + } + std::cout << std::endl; + } + std::cout << std::endl; + } +} + +// Test the constructor. +TEST_F(DeltaTriplesTest, constructor) { + DeltaTriples deltaTriples(testQec->getIndex()); + ASSERT_EQ(deltaTriples.numInserted(), 0); + ASSERT_EQ(deltaTriples.numDeleted(), 0); +} + +// Test clear after inserting or deleting a few triples. +TEST_F(DeltaTriplesTest, clear) { + // Insert then clear. + DeltaTriples deltaTriples(testQec->getIndex()); + deltaTriples.insertTriple(makeTurtleTriple(" ")); + ASSERT_EQ(deltaTriples.numInserted(), 1); + ASSERT_EQ(deltaTriples.numDeleted(), 0); + checkTriplesWithPositionsPerBlockSize(deltaTriples, 1); + deltaTriples.clear(); + ASSERT_EQ(deltaTriples.numInserted(), 0); + ASSERT_EQ(deltaTriples.numDeleted(), 0); + checkTriplesWithPositionsPerBlockSize(deltaTriples, 0); + + // Delete then clear. + deltaTriples.deleteTriple(makeTurtleTriple(" ")); + ASSERT_EQ(deltaTriples.numInserted(), 0); + ASSERT_EQ(deltaTriples.numDeleted(), 1); + checkTriplesWithPositionsPerBlockSize(deltaTriples, 1); + deltaTriples.clear(); + ASSERT_EQ(deltaTriples.numInserted(), 0); + ASSERT_EQ(deltaTriples.numInserted(), 0); + checkTriplesWithPositionsPerBlockSize(deltaTriples, 0); +} + +// Check that insert and delete work as they should. The core of this test is to +// check that `locateTripleInPermutation` and `locateTripleInAllPermutations` +// work correctly. +// +// TODO: Wouldn't it make more sense to test the mentioned functions instead of +// `insertTriple` and `deleteTriple`? +TEST_F(DeltaTriplesTest, insertAndDeleteTriples) { + const Index& index = testQec->getIndex(); + DeltaTriples deltaTriples(index); + + // Check the given `locatedTriple` (a block index, an index in the + // block, and a triple) is correct for the given permutation as follows: + // + // 1. If `locatedTriple.existsInIndex == true`, check that the + // triple indeed occurs at that position in the respective triple. + // + // 2. If `locatedTriple.existsInIndex == false`, check that the + // triple at the position is larger and the triple at the previous + // position is smaller. + auto checkTripleWithPositionInPermutation = + [&](const LocatedTriple& locatedTriple, const auto& permutation, + const std::vector>& relationIdsPerBlock) { + // Shortcuts for the tiples ids and its position. + const size_t blockIndex = locatedTriple.blockIndex; + const size_t rowIndexInBlock = locatedTriple.rowIndexInBlock; + const bool existsInIndex = locatedTriple.existsInIndex; + const IdTriple deltaTriple{locatedTriple.id1, locatedTriple.id2, + locatedTriple.id3}; + + // Members for accessing the data of a permutation. + auto& file = permutation._file; + const auto& meta = permutation._meta; + const auto& reader = permutation._reader; + + // Prepare a message for when one of our assertions fails, with nice + // names for the permutation and the `deltaTriple`. + auto [namePermutation, nameTriple] = getNicePermutationAndTripleName( + deltaTriples, permutation, deltaTriple); + std::string msg = + absl::StrCat("Permutation ", namePermutation, ", triple ", + nameTriple, ", block index ", blockIndex, + ", row index in block ", rowIndexInBlock, "\n"); + + // If the `blockIndex` is beyond the last index, check the following: + // + // 1. The delta triple does not exist in the index + // 2. The delta triple is larger than all triples in the index + // 3. Exit this test (there is nothing more to test in that case) + const vector& metadataPerBlock = + meta.blockData(); + AD_CONTRACT_CHECK(metadataPerBlock.size() > 0); + IdTriple lastTriple{metadataPerBlock.back().col0LastId_, + metadataPerBlock.back().col1LastId_, + metadataPerBlock.back().col2LastId_}; + if (blockIndex >= metadataPerBlock.size()) { + ASSERT_EQ(blockIndex, metadataPerBlock.size()) << msg; + ASSERT_FALSE(existsInIndex); + ASSERT_GT(deltaTriple, lastTriple); + return; + } + + // Read the triple at the block position and at the previous position + // (which might be in the previous block). + // + // TODO: We assume here that `Id::makeUndefined()` is strictly smaller + // than any regular `Id`. Is that correct? + // + // NOTE: When `blockIndex` is valid (we have handled the other case + // already above), `rowIndexInBlock` should always be a valid index into + // the block (and never one too large); check the semantics of + // `locateTripleInAllPermutations`. + const auto& blockMetadata = metadataPerBlock.at(blockIndex); + const auto& blockTuples = + reader.readAndDecompressBlock(blockMetadata, file, std::nullopt); + ASSERT_LT(rowIndexInBlock, blockTuples.size()) << msg; + IdTriple blockTriple{relationIdsPerBlock[blockIndex][rowIndexInBlock], + blockTuples(rowIndexInBlock, 0), + blockTuples(rowIndexInBlock, 1)}; + auto blockTriplePrevious = [&]() -> IdTriple { + if (rowIndexInBlock > 0) { + return IdTriple{ + relationIdsPerBlock[blockIndex][rowIndexInBlock - 1], + blockTuples(rowIndexInBlock - 1, 0), + blockTuples(rowIndexInBlock - 1, 1)}; + } else if (blockIndex > 0) { + return IdTriple{metadataPerBlock[blockIndex - 1].col0LastId_, + metadataPerBlock[blockIndex - 1].col1LastId_, + metadataPerBlock[blockIndex - 1].col2LastId_}; + } else { + return IdTriple{Id::makeUndefined(), Id::makeUndefined(), + Id::makeUndefined()}; + } + }(); + + // Now we can check whether our delta triple is exactly at the right + // location. + if (existsInIndex) { + ASSERT_EQ(blockTriple, deltaTriple) << msg; + ASSERT_LT(blockTriplePrevious, deltaTriple) << msg; + } else { + ASSERT_GT(blockTriple, deltaTriple) << msg; + ASSERT_LT(blockTriplePrevious, deltaTriple) << msg; + } + }; + + // Check that all `locatedTriple` in `positionsPerBlock` are + // correct for the given permutation. + auto checkAllTriplesWithPositionsForPermutation = + [&](const LocatedTriplesPerBlock& triplesWithPositionsPerBlock, + const auto& permutation) { + std::vector> allRelationIdsForPermutation = + getAllRelationIdsForPermutation(permutation); + for (const auto& [blockIndex, triplesWithPositions] : + triplesWithPositionsPerBlock.map_) { + for (const auto& locatedTriple : triplesWithPositions) { + checkTripleWithPositionInPermutation(locatedTriple, permutation, + allRelationIdsForPermutation); + } + } + }; + + // Check that all `locatedTriple`s are correct (for all + // permutations). the given permutation. + auto checkAllTriplesWithPositionForAllPermutations = + [&](const DeltaTriples& deltaTriples) { + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Permutation::POS), + index.getImpl().POS()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Permutation::PSO), + index.getImpl().PSO()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Permutation::SPO), + index.getImpl().SPO()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Permutation::SOP), + index.getImpl().SOP()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Permutation::OPS), + index.getImpl().OPS()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Permutation::OSP), + index.getImpl().OSP()); + }; + + // Check if each existing triple is located correctly in every + // permutation. + // + // TODO: Check that `existsInIndex` was set correctly. Test test routine + // above just take it from the tested `LocatedTriple` objects + // (which might be wrong) + // + // TODO: Check that each triple that was located was indeed added to + // each of the `LocatedTriplesPerBlock` objects. + // + // TODO: Eventually, we should test `insertTriple` and `deleteTriple`, + // which only insert a triple when it doesn't exist in the original + // index, and which only delete a triple, when it does exist in the + // original index. But let's first get `locateTripleInAllPermutations` + // correct. Note that to check whether a triple exists or not in the + // original index, looking at one permutation suffices. + const std::vector& testTriples = getTestTriples(); + for (std::string_view triple : testTriples) { + deltaTriples.deleteTriple(makeTurtleTriple(triple)); + } + checkTriplesWithPositionsPerBlockSize(deltaTriples, testTriples.size()); + checkAllTriplesWithPositionForAllPermutations(deltaTriples); + + // Inserting the triples a second time should throw an exception (and not + // change anything about the internal data structures). + for (std::string_view triple : testTriples) { + AD_EXPECT_THROW_WITH_MESSAGE( + deltaTriples.deleteTriple(makeTurtleTriple(triple)), + ::testing::HasSubstr("this deletion therefore has no effect")); + } + checkTriplesWithPositionsPerBlockSize(deltaTriples, testTriples.size()); + checkAllTriplesWithPositionForAllPermutations(deltaTriples); + + // Check that new triples are located correctly in every permutation. + for (std::string_view triple : testTriples) { + std::string newTriple{triple}; + newTriple[1] = 'X'; + deltaTriples.insertTriple(makeTurtleTriple(newTriple)); + } + checkTriplesWithPositionsPerBlockSize(deltaTriples, 2 * testTriples.size()); + checkAllTriplesWithPositionForAllPermutations(deltaTriples); + + // Deleting the triples a second time should throw an exception (and not + // change anything about the internal data structures). + for (std::string_view triple : testTriples) { + std::string newTriple{triple}; + newTriple[1] = 'X'; + AD_EXPECT_THROW_WITH_MESSAGE( + deltaTriples.insertTriple(makeTurtleTriple(newTriple)), + ::testing::HasSubstr("this insertion therefore has no effect")); + } + checkTriplesWithPositionsPerBlockSize(deltaTriples, 2 * testTriples.size()); + checkAllTriplesWithPositionForAllPermutations(deltaTriples); +} + +// Visualize the result of `findTripleInPermutation` for one particular +// triple by showing the whole block (for understanding and debugging +// only, this will eventually be deleted). +TEST_F(DeltaTriplesTest, findTripleInAllPermutationsVisualize) { + const Index& index = testQec->getIndex(); + DeltaTriples deltaTriples(index); + // std::string tripleAsString = " "; + std::string tripleAsString = " "; + // std::string tripleAsString = " "; + // std::string tripleAsString = " "; + // std::string tripleAsString = " <0> "; + // std::string tripleAsString = " "; + std::cout << std::endl; + std::cout << "Searching the following triple: " << tripleAsString + << std::endl; + std::cout << "For each permutation, find the first element that is not " + "smaller" + << std::endl; + + // Search the triple in all permutations. + IdTriple idTriple = makeIdTriple(deltaTriples, tripleAsString); + auto handles = deltaTriples.locateTripleInAllPermutations(idTriple); + + // Helper lambda for showing the block from the given permutation that + // contains the given (via an iterator) `LocatedTriple` object. + auto showBlock = [&](LocatedTriples::iterator& locatedTriple, + const auto& permutation) { + // Shortcuts for the triple and its position. + // AD_CORRECTNESS_CHECK(locatedTriple != locatedTriple.end()); + const size_t blockIndex = locatedTriple->blockIndex; + const size_t rowIndexInBlock = locatedTriple->rowIndexInBlock; + const bool existsInIndex = locatedTriple->existsInIndex; + const IdTriple deltaTriple{locatedTriple->id1, locatedTriple->id2, + locatedTriple->id3}; + + // Get nice names for the permutation and the triple. + auto [namePermutation, nameTriple] = + getNicePermutationAndTripleName(deltaTriples, permutation, deltaTriple); + + // If we are beyond the last block, there is nothing to show. + const vector& blockMetas = + permutation._meta.blockData(); + if (blockIndex >= blockMetas.size()) { + std::cout << endl; + std::cout << "All triples in " << namePermutation << " are smaller than " + << nameTriple << std::endl; + return; + } + + // Read the block and compute all relation `Id`s. + const CompressedBlockMetadata& blockMetadata = blockMetas[blockIndex]; + DecompressedBlock blockTuples = permutation._reader.readAndDecompressBlock( + blockMetadata, permutation._file, std::nullopt); + std::vector blockRelationIds = + getAllRelationIdsForPermutation(permutation).at(blockIndex); + AD_CORRECTNESS_CHECK(blockRelationIds.size() == blockTuples.size()); + + // Show the triples in the block. + std::cout << std::endl; + std::cout << "Block #" << blockIndex << " from " << namePermutation << " (" + << nameTriple << "):" << std::endl; + for (size_t i = 0; i < blockTuples.numRows(); ++i) { + std::cout << "Row #" << i << ": " + << getNameForId(blockRelationIds[i], deltaTriples); + for (size_t j = 0; j < blockTuples.numColumns(); ++j) { + std::cout << " " << getNameForId(blockTuples(i, j), deltaTriples); + } + if (i == rowIndexInBlock) { + std::cout << " <-- " + << (existsInIndex ? "existing triple" : "new triple"); + } + std::cout << std::endl; + } + }; + + // Show block for each permutation. + showBlock(handles.forPOS, index.getImpl().POS()); + showBlock(handles.forPSO, index.getImpl().PSO()); + showBlock(handles.forSPO, index.getImpl().SPO()); + showBlock(handles.forSOP, index.getImpl().SOP()); + showBlock(handles.forOSP, index.getImpl().OSP()); + showBlock(handles.forOPS, index.getImpl().OPS()); + std::cout << std::endl; +} diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp new file mode 100644 index 0000000000..822e7c4b0b --- /dev/null +++ b/test/LocatedTriplesTest.cpp @@ -0,0 +1,307 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Hannah Bast + +#include + +#include "./util/IdTableHelpers.h" +#include "./util/IdTestHelpers.h" +#include "index/CompressedRelation.h" +#include "index/IndexMetaData.h" +#include "index/LocatedTriples.h" +#include "index/Permutations.h" + +// TODO: Why the namespace here? (copied from `test/IndexMetaDataTest.cpp`) +namespace { +auto V = ad_utility::testing::VocabId; +} + +// Fixture with helper functions. +class LocatedTriplesTest : public ::testing::Test { + protected: + // Make `LocatedTriplesPerBlock` from a list of `LocatedTriple` objects (the + // order in which the objects are given does not matter). + LocatedTriplesPerBlock makeLocatedTriplesPerBlock( + std::vector locatedTriples) { + LocatedTriplesPerBlock result; + for (auto locatedTriple : locatedTriples) { + result.add(locatedTriple); + } + return result; + } +}; + +// Test the method that counts the number of `LocatedTriple's in a block. +TEST_F(LocatedTriplesTest, numTriplesInBlock) { + // Set up lists of located triples for three blocks. + auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( + {LocatedTriple{1, 0, V(10), V(1), V(0), true}, + LocatedTriple{1, 0, V(10), V(2), V(1), true}, + LocatedTriple{1, 0, V(11), V(3), V(0), false}, + LocatedTriple{2, 0, V(20), V(4), V(0), false}, + LocatedTriple{2, 0, V(21), V(5), V(0), false}, + LocatedTriple{3, 0, V(30), V(6), V(0), false}, + LocatedTriple{3, 0, V(32), V(7), V(0), true}}); + ASSERT_EQ(locatedTriplesPerBlock.numBlocks(), 3); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(), 7); + + auto P = [](size_t n1, size_t n2) -> std::pair { + return {n1, n2}; + }; + + // Check the total counts per block. + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1), P(1, 2)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2), P(2, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3), P(1, 1)); + + // Check the counts per block for a given `id1`. + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10)), P(0, 2)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(11)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(20)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(21)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(30)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(32)), P(0, 1)); + + // Check the counts per block for a given `id1` and `id2`. + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10), V(1)), P(0, 1)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10), V(2)), P(0, 1)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(11), V(3)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(20), V(4)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(21), V(5)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(30), V(6)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(32), V(7)), P(0, 1)); +} + +// Test the method that merges the matching `LocatedTriple`s from a block into a +// part of an `IdTable`. +TEST_F(LocatedTriplesTest, mergeTriples) { + // A block, as it could come from an index scan. + IdTable block = makeIdTableFromVector({{10, 10}, // Row 0 + {15, 20}, // Row 1 + {15, 30}, // Row 2 + {20, 10}, // Row 3 + {30, 20}, // Row 4 + {30, 30}}); // Row 5 + + // A set of located triples for that block. + auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( + {LocatedTriple{1, 0, V(1), V(10), V(10), true}, // Delete row 0 + LocatedTriple{1, 1, V(1), V(10), V(11), false}, // Insert before row 1 + LocatedTriple{1, 1, V(2), V(11), V(10), false}, // Insert before row 1 + LocatedTriple{1, 4, V(2), V(21), V(11), false}, // Insert before row 4 + LocatedTriple{1, 4, V(2), V(30), V(10), false}, // Insert before row 4 + LocatedTriple{1, 4, V(2), V(30), V(20), true}, // Delete row 4 + LocatedTriple{1, 5, V(3), V(30), V(30), true}}); // Delete row 5 + + // Merge all these triples into `block` and check that the result is as + // expected (four triples inserted and three triples deleted). + { + IdTable resultExpected = makeIdTableFromVector({{10, 11}, // Row 0 + {11, 10}, // Row 1 + {15, 20}, // Row 2 + {15, 30}, // Row 3 + {20, 10}, // Row 4 + {21, 11}, // Row 5 + {30, 10}}); // Row 6 + IdTable result(2, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, block.clone(), result, 0); + ASSERT_EQ(result, resultExpected); + } + + // Merge only the triples with `id1 == V(2)` into `block` (three triples + // inserted and one triple deleted). + { + IdTable resultExpected = makeIdTableFromVector({{10, 10}, // Row 0 + {11, 10}, // Row 1 + {15, 20}, // Row 2 + {15, 30}, // Row 3 + {20, 10}, // Row 4 + {21, 11}, // Row 5 + {30, 10}, // Row 6 + {30, 30}}); // Row 7 + IdTable result(2, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, block.clone(), result, 0, V(2)); + ASSERT_EQ(result, resultExpected); + } + + // Repeat but with a partial block that leaves out the first two elements of + // `block`. + { + IdTable resultExpected = makeIdTableFromVector({{15, 30}, // Row 0 + {20, 10}, // Row 1 + {21, 11}, // Row 2 + {30, 10}, // Row 3 + {30, 30}}); // Row 4 + IdTable result(2, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, block.clone(), result, 0, V(2), 2); + ASSERT_EQ(result, resultExpected); + } + + // Merge only the triples with `id1 == V(2)` and `id2 == V(30)` into the + // corresponding partial block (one triple inserted, one triple deleted). + { + IdTable blockColumnId3(1, ad_utility::testing::makeAllocator()); + blockColumnId3.resize(block.size()); + for (size_t i = 0; i < block.size(); ++i) { + blockColumnId3(i, 0) = block(i, 1); + } + IdTable resultExpected = makeIdTableFromVector({{10}, {30}}); + IdTable result(1, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, std::move(blockColumnId3), result, 0, + V(2), V(30), 4, 6); + ASSERT_EQ(result, resultExpected); + } + + // Merge special triples. + { + size_t NRI = LocatedTriple::NO_ROW_INDEX; + auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( + {LocatedTriple{2, NRI, V(1), V(30), V(40), true}, + LocatedTriple{2, NRI, V(1), V(30), V(50), true}, + LocatedTriple{2, NRI, V(1), V(40), V(10), true}}); + IdTable resultExpected = makeIdTableFromVector({{30, 40}, // Row 0 + {30, 50}, // Row 1 + {40, 10}}); // Row 2 + IdTable result(2, ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(2, std::nullopt, result, 0, V(1)); + } +} + +// Test `Permutation::scan` (and hence also `CompressedRelation::scan`) with +// triples merged from a `locatedTriplesPerBlock` object. +TEST_F(LocatedTriplesTest, scanWithMergeTriples) { + // The actual test, for a given block size. + auto testWithGivenBlockSize = [](const size_t blockSizeInBytes, + size_t numRelations, Id relationId) { + std::string basename = "LocatedTriplesTest.scanWithMergeTriples"; + std::string permutationFilename = basename + ".index.pso"; + + // Helper lambda for creating a `BufferedIdTable` (which we need for + // `CompressedRelationWriter` from an ordinary `IdTable` with two columns). + // + // TODO: Something like this is also used in `CompressedRelationsTest`, so + // it should be in a helper class. + auto getBufferedIdTable = [](const IdTable& idTable) -> BufferedIdTable { + // Note that these files are never created because we set the threshold + // for writing to disk so large. + std::string bufferFilename1 = "compressedRelationWriter.buffer1.dat"; + std::string bufferFilename2 = "compressedRelationWriter.buffer2.dat"; + AD_CONTRACT_CHECK(idTable.numColumns() == 2); + BufferedIdTable bufferedIdTable{ + 2, + std::array{ad_utility::BufferedVector{ + std::numeric_limits::max(), bufferFilename1}, + ad_utility::BufferedVector{ + std::numeric_limits::max(), bufferFilename2}}}; + for (size_t i = 0; i < idTable.size(); ++i) { + bufferedIdTable.push_back({idTable(i, 0), idTable(i, 1)}); + } + return bufferedIdTable; + }; + + // Our test relation. + IdTable relation = makeIdTableFromVector({{10, 10}, // Row 0 + {15, 20}, // Row 1 + {15, 30}, // Row 2 + {20, 10}, // Row 3 + {30, 20}, // Row 4 + {30, 30}}); // Row 5 + + // Write the permutation to disk (adapted from `CompressedRelationsTest`, + // `IndexImpl::createPermutationPairImpl`, and `IndexImpl::). + { + ad_utility::File permutationFileForWritingRelations{permutationFilename, + "w"}; + IndexMetaDataMmap metadataMmap; + metadataMmap.setup(permutationFilename + MMAP_FILE_SUFFIX, + ad_utility::CreateTag{}); + CompressedRelationWriter writer{ + std::move(permutationFileForWritingRelations), blockSizeInBytes}; + for (size_t i = 1; i <= numRelations; ++i) { + // The third argument is the number of distinct elements. + auto relationMetadata = writer.addRelation( + V(i), getBufferedIdTable(relation), relation.size()); + metadataMmap.add(relationMetadata); + } + metadataMmap.blockData() = std::move(writer).getFinishedBlocks(); + ad_utility::File permutationFileForWritingMetadata(permutationFilename, + "r+"); + metadataMmap.appendToFile(&permutationFileForWritingMetadata); + } + + // Create a permutation based on this. + LocatedTriplesPerBlock locatedTriplesPerBlock; + Permutation permutation{"PSO", ".pso", {1, 0, 2}, locatedTriplesPerBlock}; + permutation.loadFromDisk(basename); + // ad_utility::File permutationFileForReading{permutationFilename, "r"}; + // permutation._file = std::move(permutationFileForReading); + // permutation._meta = metadata; + // permutation._isLoaded = true; + + // Read the (for this test: first and only) relation from disk and check + // that it is the same. + { + IdTable result(2, ad_utility::testing::makeAllocator()); + permutation.scan(relationId, &result); + ASSERT_EQ(result, relation); + } + + // Helper lambda for adding to `locatedTriplesPerBlock`. + auto locatedTriplesPerBlockAdd = [&locatedTriplesPerBlock, &relationId, + &permutation](Id id2, Id id3) { + locatedTriplesPerBlock.add(LocatedTriple::locateTripleInPermutation( + relationId, id2, id3, permutation)); + }; + + // Again, but with some located triples merged (three inserts, four + // deletes). + locatedTriplesPerBlockAdd(V(15), V(20)); // Delete. + locatedTriplesPerBlockAdd(V(14), V(20)); // Insert. + locatedTriplesPerBlockAdd(V(20), V(10)); // Delete. + locatedTriplesPerBlockAdd(V(30), V(20)); // Delete. + locatedTriplesPerBlockAdd(V(30), V(30)); // Delete. + locatedTriplesPerBlockAdd(V(30), V(31)); // Insert at very end. + locatedTriplesPerBlockAdd(V(30), V(32)); // Insert at very end. + std::cout << locatedTriplesPerBlock; + { + IdTable result(2, ad_utility::testing::makeAllocator()); + permutation.scan(relationId, &result); + IdTable resultExpected = makeIdTableFromVector({{10, 10}, // Row 0 + {14, 20}, // Row 1 + {15, 30}, // Row 2 + {30, 31}, // Row 3 + {30, 32}}); // Row 4 + ASSERT_EQ(result, resultExpected); + } + + // Now a scan where two `Id`s are fixed. + { + IdTable result(1, ad_utility::testing::makeAllocator()); + result.resize(2); + permutation.scan(relationId, V(30), &result); + IdTable resultExpected = makeIdTableFromVector({{31}, {32}}); + ASSERT_EQ(result, resultExpected); + } + + // Delete the file with the compressed relations. + ad_utility::deleteFile(permutationFilename); + }; + + // Now test for multiple block sizes (16 bytes is the minimum), relation + // sizes, and relations. + // + // TODO: Currently fails if `numRelations > 1`. + size_t numRelations = 1; + for (size_t i = 1; i <= numRelations; ++i) { + testWithGivenBlockSize(16, numRelations, V(i)); + testWithGivenBlockSize(32, numRelations, V(i)); + testWithGivenBlockSize(48, numRelations, V(i)); + testWithGivenBlockSize(64, numRelations, V(i)); + testWithGivenBlockSize(100'000, numRelations, V(i)); + } +} diff --git a/test/ValueIdTest.cpp b/test/ValueIdTest.cpp index dab815e207..3963e6eca5 100644 --- a/test/ValueIdTest.cpp +++ b/test/ValueIdTest.cpp @@ -278,15 +278,15 @@ TEST(ValueId, toDebugString) { stream << id; ASSERT_EQ(stream.str(), expected); }; - test(ValueId::makeUndefined(), "Undefined:Undefined"); - test(ValueId::makeFromInt(-42), "Int:-42"); - test(ValueId::makeFromDouble(42.0), "Double:42.000000"); - test(makeVocabId(15), "VocabIndex:15"); - test(makeLocalVocabId(25), "LocalVocabIndex:25"); - test(makeTextRecordId(37), "TextRecordIndex:37"); + test(ValueId::makeUndefined(), "U:xx"); + test(ValueId::makeFromInt(-42), "I:-42"); + test(ValueId::makeFromDouble(42.0), "D:42.000000"); + test(makeVocabId(15), "V:15"); + test(makeLocalVocabId(25), "L:25"); + test(makeTextRecordId(37), "T:37"); test(ValueId::makeFromDate( DateOrLargeYear{123456, DateOrLargeYear::Type::Year}), - "Date:123456"); + "D:123456"); } TEST(ValueId, InvalidDatatypeEnumValue) { diff --git a/test/ValuesForTestingTest.cpp b/test/ValuesForTestingTest.cpp index 44d95a3cc6..42e1ec7258 100644 --- a/test/ValuesForTestingTest.cpp +++ b/test/ValuesForTestingTest.cpp @@ -27,8 +27,8 @@ TEST(ValuesForTesting, valuesForTesting) { ASSERT_EQ(v.getMultiplicity(1), 84.0); ASSERT_THAT(v.asString(), - ::testing::StartsWith("Values for testing with 2 columns and " - "contents VocabIndex:3 VocabIndex:12")); + ::testing::StartsWith( + "Values for testing with 2 columns and contents V:3 V:12")); ASSERT_EQ(v.getDescriptor(), "explicit values for testing"); ASSERT_TRUE(v.resultSortedOn().empty()); ASSERT_TRUE(v.getChildren().empty()); diff --git a/test/util/AllocatorTestHelpers.h b/test/util/AllocatorTestHelpers.h index 0666c70488..c71b8687fe 100644 --- a/test/util/AllocatorTestHelpers.h +++ b/test/util/AllocatorTestHelpers.h @@ -1,6 +1,6 @@ -// Copyright 2023, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach #pragma once @@ -10,9 +10,9 @@ namespace ad_utility::testing { // Create an unlimited allocator. inline ad_utility::AllocatorWithLimit& makeAllocator() { - static ad_utility::AllocatorWithLimit a{ + static ad_utility::AllocatorWithLimit allocator{ ad_utility::makeAllocationMemoryLeftThreadsafeObject( std::numeric_limits::max())}; - return a; + return allocator; } } // namespace ad_utility::testing