From 9a08db444c7a4966e4d2e239a4274b351b2eb81f Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Wed, 15 Mar 2023 21:11:03 +0100 Subject: [PATCH 01/20] First steps towards prototype for UPDATE 1. Add support for URL parameters "insert=..." and "delete=..." for inserting or deleting a single triple. 2. Stub of new class `DeltaTriples` that maintains the set of inserted and deleted triples. 3. First working implementation of a method `findTripleInPermutation` that for a given triple and a given permutation, finds the matching block in that permutation and the right position in that block. --- src/engine/Server.cpp | 53 +++++++++++++++++++++++++++++- src/engine/Server.h | 2 ++ src/index/CMakeLists.txt | 1 + src/index/CompressedRelation.h | 1 + src/index/ConstantsIndexBuilding.h | 9 +++-- src/index/IndexMetaData.h | 3 ++ test/CMakeLists.txt | 3 +- 7 files changed, 67 insertions(+), 5 deletions(-) diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 197f79e9de..4f9a3b0f17 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -1,4 +1,4 @@ -// Copyright 2011 - 2022, University of Freiburg +// Copyright 2011 - 2023, University of Freiburg // Chair of Algorithms and Data Structures // Authors: Björn Buchhold // Johannes Kalmbach @@ -14,6 +14,7 @@ #include "absl/cleanup/cleanup.h" #include "engine/ExportQueryExecutionTrees.h" #include "engine/QueryPlanner.h" +#include "parser/TurtleParser.h" #include "util/BoostHelpers/AsyncWaitForFuture.h" template @@ -34,6 +35,7 @@ Server::Server(const int port, const int numThreads, size_t maxMemGB, }}, _sortPerformanceEstimator(), _index(), + _deltaTriples(_index), _engine(), _initialized(false), // The number of server threads currently also is the number of queries @@ -316,11 +318,58 @@ Awaitable Server::process( logCommand(cmd, "clear cache completely (including unpinned elements)"); _cache.clearAll(); response = createJsonResponse(composeCacheStatsJson(), request); + } else if (auto cmd = checkParameter("cmd", 
"clear-delta-triples")) { + logCommand(cmd, "clear delta triples"); + _deltaTriples.clear(); + response = createJsonResponse(composeCacheStatsJson(), request); } else if (auto cmd = checkParameter("cmd", "get-settings")) { logCommand(cmd, "get server settings"); response = createJsonResponse(RuntimeParameters().toMap(), request); } + // Insert or delete triples. + // + // TODO: This is a preliminary interface for testing. Eventually, this should + // be included in our SPARQL grammer (where the line `updateUnit : update;` at + // the beginning is currently commented out). + // + // TODO: For testing purposes, allow insertions and deletions without access + // token. Eventually, this should be restricted, of course, which can be + // easily done by adding the argument `accessTokenOk` to each of the calls for + // `checkParameter`. + { + bool insertDetected = false; + bool deleteDetected = false; + std::optional parameterValue; + if (parameterValue = checkParameter("insert", std::nullopt)) { + LOG(INFO) << "INSERT: " << parameterValue.value() << std::endl; + insertDetected = true; + } else if (parameterValue = checkParameter("delete", std::nullopt)) { + LOG(INFO) << "DELETE: " << parameterValue.value() << std::endl; + deleteDetected = true; + } + if (insertDetected || deleteDetected) { + AD_CORRECTNESS_CHECK(parameterValue.has_value()); + TurtleStringParser parser; + parser.parseUtf8String(parameterValue.value()); + if (parser.getTriples().size() == 0) { + throw std::runtime_error("Triple could not be parsed"); + } else if (parser.getTriples().size() > 1) { + throw std::runtime_error("Only one triple per call please"); + } + TurtleTriple turtleTriple = parser.getTriples()[0]; + if (insertDetected) { + _deltaTriples.insertTriple(std::move(turtleTriple)); + response = createOkResponse("INSERT operation processed", request, + ad_utility::MediaType::textPlain); + } else { + _deltaTriples.deleteTriple(std::move(turtleTriple)); + response = createOkResponse("DELETE 
operation processed", request, + ad_utility::MediaType::textPlain); + } + } + } + // Ping with or without messsage. if (urlPathAndParameters._path == "/ping") { if (auto msg = checkParameter("msg", std::nullopt)) { @@ -461,6 +510,8 @@ json Server::composeStatsJson() const { result["num-text-records"] = _index.getNofTextRecords(); result["num-word-occurrences"] = _index.getNofWordPostings(); result["num-entity-occurrences"] = _index.getNofEntityPostings(); + result["num-delta-triples-inserted"] = _deltaTriples.numInserted(); + result["num-delta-triples-deleted"] = _deltaTriples.numDeleted(); return result; } diff --git a/src/engine/Server.h b/src/engine/Server.h index 41275504c2..4ef4aaccf1 100644 --- a/src/engine/Server.h +++ b/src/engine/Server.h @@ -12,6 +12,7 @@ #include "engine/QueryExecutionContext.h" #include "engine/QueryExecutionTree.h" #include "engine/SortPerformanceEstimator.h" +#include "index/DeltaTriples.h" #include "index/Index.h" #include "nlohmann/json.hpp" #include "parser/ParseException.h" @@ -58,6 +59,7 @@ class Server { ad_utility::AllocatorWithLimit _allocator; SortPerformanceEstimator _sortPerformanceEstimator; Index _index; + DeltaTriples _deltaTriples; Engine _engine; bool _initialized; diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index 28acd40364..e935088ff4 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -8,6 +8,7 @@ add_library(index VocabularyOnDisk.h VocabularyOnDisk.cpp IndexMetaData.h IndexMetaDataImpl.h MetaDataHandler.h + DeltaTriples.h DeltaTriples.cpp StxxlSortFunctors.h TextMetaData.cpp TextMetaData.h DocsDB.cpp DocsDB.h diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 86ba5931af..1db29e37bb 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -304,6 +304,7 @@ class CompressedRelationReader { static void decompressColumn(const std::vector& compressedColumn, size_t numRowsToRead, Iterator iterator); + public: // Read the 
block that is identified by the `blockMetaData` from the `file`, // decompress and return it. // If `columnIndices` is `nullopt`, then all columns of the block are read, diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h index 6d73100e89..6249b118d1 100644 --- a/src/index/ConstantsIndexBuilding.h +++ b/src/index/ConstantsIndexBuilding.h @@ -79,6 +79,9 @@ constexpr size_t QUEUE_SIZE_BEFORE_PARALLEL_PARSING = 10; // time constexpr size_t QUEUE_SIZE_AFTER_PARALLEL_PARSING = 10; -// The uncompressed size in bytes of a block of the permutations. Currently 8MB -// is chosen which is well suited for zstd compression -constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 23u; +// The uncompressed size in bytes of a block of the permutations. Currently 8MB +// is chosen which is well suited for zstd compression +// +// NOTE: For playing around with `DeltaTriples`, I am setting this to a +// deliberately small number. +constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 3 * 16; // 1ul << 23u; diff --git a/src/index/IndexMetaData.h b/src/index/IndexMetaData.h index bd454c0470..99fa2f7445 100644 --- a/src/index/IndexMetaData.h +++ b/src/index/IndexMetaData.h @@ -86,7 +86,10 @@ class IndexMetaData { // name and the variable name are terrible. // For each relation, its meta data. + public: MapType _data; + + private: // For each compressed block, its meta data. 
BlocksType _blockData; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 20abd82e99..66788dc736 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -104,6 +104,8 @@ addLinkAndDiscoverTest(IndexMetaDataTest index) # TODO fix this addLinkAndDiscoverTestSerial(IndexTest index) +addLinkAndDiscoverTestSerial(DeltaTriplesTest index) + addLinkAndDiscoverTest(FTSAlgorithmsTest index) addLinkAndDiscoverTest(EngineTest engine) @@ -305,4 +307,3 @@ addLinkAndDiscoverTestSerial(AggregateExpressionTest parser sparqlExpressions in addLinkAndDiscoverTest(OnDestructionDontThrowDuringStackUnwindingTest) addLinkAndDiscoverTest(ExceptionHandlingTest) - From 4bfc02e7001db1444fe589e9226edaf90974804d Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Fri, 17 Mar 2023 10:17:34 +0100 Subject: [PATCH 02/20] New version with a first proper test There is now a test that checks for all existing triples whether the found location is correct (by checking `id2` and `id3` at the found position in the found block; note that the block does not have an explicit `id1` for a given position). The `findTripleInPermutation` method is still (very) inefficient in that it goes through the complete relation metadata in order to find the sequence of `id1`s relevant for a block. This will be fixed in the next commit. Note: the previous commit lacked the new files `DeltaTriples.h`, `DeltaTriples.cpp`, and `DeltaTriplesTest.cpp`. 
--- src/index/CompressedRelation.cpp | 7 +- src/index/CompressedRelation.h | 11 ++ src/index/DeltaTriples.cpp | 221 +++++++++++++++++++++++++++++++ src/index/DeltaTriples.h | 166 +++++++++++++++++++++++ src/index/MetaDataHandler.h | 79 ++++++++--- test/DeltaTriplesTest.cpp | 174 ++++++++++++++++++++++++ test/IndexMetaDataTest.cpp | 10 +- 7 files changed, 643 insertions(+), 25 deletions(-) create mode 100644 src/index/DeltaTriples.cpp create mode 100644 src/index/DeltaTriples.h create mode 100644 test/DeltaTriplesTest.cpp diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp index dab726bbc7..a3f8e4d79a 100644 --- a/src/index/CompressedRelation.cpp +++ b/src/index/CompressedRelation.cpp @@ -334,12 +334,13 @@ void CompressedRelationWriter::addRelation(Id col0Id, multC2}; auto sizeOfRelation = col1And2Ids.numRows() * col1And2Ids.numColumns() * sizeof(Id); + auto sizeOfBuffer = _buffer.numRows() * _buffer.numColumns() * sizeof(Id); // If this is a large relation, or the currrently buffered relations + // this relation are too large, we will write the buffered relations to file // and start a new block. 
if (sizeOfRelation > _numBytesPerBlock * 8 / 10 || - sizeOfRelation + _buffer.numRows() > 1.5 * _numBytesPerBlock) { + sizeOfRelation + sizeOfBuffer > 1.5 * _numBytesPerBlock) { writeBufferedRelationsToSingleBlock(); } @@ -358,6 +359,7 @@ void CompressedRelationWriter::addRelation(Id col0Id, } _currentBlockData._col0LastId = col0Id; _currentBlockData._col1LastId = col1And2Ids(col1And2Ids.numRows() - 1, 0); + _currentBlockData._col2LastId = col1And2Ids(col1And2Ids.numRows() - 1, 1); AD_CORRECTNESS_CHECK(_buffer.numColumns() == col1And2Ids.numColumns()); auto bufferOldSize = _buffer.numRows(); _buffer.resize(_buffer.numRows() + col1And2Ids.numRows()); @@ -387,7 +389,8 @@ void CompressedRelationWriter::writeRelationToExclusiveBlocks( _blockBuffer.push_back(CompressedBlockMetadata{ std::move(offsets), actualNumRowsPerBlock, col0Id, col0Id, data[i][0], - data[i + actualNumRowsPerBlock - 1][0]}); + data[i + actualNumRowsPerBlock - 1][0], + data[i + actualNumRowsPerBlock - 1][1]}); } } diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 1db29e37bb..85e656ddbd 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -65,6 +65,16 @@ struct CompressedBlockMetadata { Id _col1FirstId; Id _col1LastId; + // For `DeltaTriples::findTripleInPermutation`, it helps to know the least + // significant ID of the last triple as well. + // + // NOTE: We don't need that information for the first triple of the block and + // as a matter of fact, we don't really need `_col0FirstId` and `_col1FirstId` + // above either. It doesn't really harm though because the total size of the + // block metadata is small (even for Wikidata, we have only 50K blocks, and as + // you can see from the members, the metadata of a block consumes < 100 bytes). + Id _col2LastId; + // Two of these are equal if all members are equal. 
bool operator==(const CompressedBlockMetadata&) const = default; }; @@ -83,6 +93,7 @@ AD_SERIALIZE_FUNCTION(CompressedBlockMetadata) { serializer | arg._col0LastId; serializer | arg._col1FirstId; serializer | arg._col1LastId; + serializer | arg._col2LastId; } // The metadata of a whole compressed "relation", where relation refers to a diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp new file mode 100644 index 0000000000..6a763788a5 --- /dev/null +++ b/src/index/DeltaTriples.cpp @@ -0,0 +1,221 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#include "index/DeltaTriples.h" + +#include "absl/strings/str_cat.h" +#include "engine/ExportQueryExecutionTrees.h" +#include "index/Index.h" +#include "index/IndexImpl.h" +#include "parser/TurtleParser.h" +#include "util/Timer.h" + +// ____________________________________________________________________________ +void DeltaTriples::clear() { + triplesInserted_.clear(); + triplesDeleted_.clear(); +} + +// ____________________________________________________________________________ +void DeltaTriples::insertTriple(TurtleTriple turtleTriple) { + IdTriple idTriple = getIdTriple(std::move(turtleTriple)); + triplesInserted_.insert(idTriple); +} + +// ____________________________________________________________________________ +void DeltaTriples::deleteTriple(TurtleTriple turtleTriple) { + IdTriple idTriple = getIdTriple(std::move(turtleTriple)); + triplesDeleted_.insert(idTriple); +} + +// ____________________________________________________________________________ +DeltaTriples::IdTriple DeltaTriples::getIdTriple(TurtleTriple turtleTriple) { + TripleComponent subject = std::move(turtleTriple._subject); + TripleComponent predicate = std::move(turtleTriple._predicate); + TripleComponent object = std::move(turtleTriple._object); + Id subjectId = std::move(subject).toValueId(index_.getVocab(), localVocab_); + Id predId = 
std::move(predicate).toValueId(index_.getVocab(), localVocab_); + Id objectId = std::move(object).toValueId(index_.getVocab(), localVocab_); + return IdTriple{subjectId, predId, objectId}; +} + +// ____________________________________________________________________________ +void DeltaTriples::findTripleInAllPermutations(const IdTriple& idTriple, + bool visualize) { + auto [s, p, o] = idTriple; + psoFindTripleResults_.emplace_back( + findTripleInPermutation(p, s, o, index_.getImpl().PSO(), visualize)); + posFindTripleResults_.emplace_back( + findTripleInPermutation(p, o, s, index_.getImpl().POS(), visualize)); + spoFindTripleResults_.emplace_back( + findTripleInPermutation(s, p, o, index_.getImpl().SPO(), visualize)); + sopFindTripleResults_.emplace_back( + findTripleInPermutation(s, o, p, index_.getImpl().SOP(), visualize)); + ospFindTripleResults_.emplace_back( + findTripleInPermutation(o, s, p, index_.getImpl().OSP(), visualize)); + opsFindTripleResults_.emplace_back( + findTripleInPermutation(o, p, s, index_.getImpl().OPS(), visualize)); +} + +// ____________________________________________________________________________ +template +DeltaTriples::FindTripleResult DeltaTriples::findTripleInPermutation( + Id id1, Id id2, Id id3, Permutation& permutation, bool visualize) const { + // Get the internal data structures from the permutation. + auto& file = permutation._file; + const auto& meta = permutation._meta; + const auto& reader = permutation._reader; + ad_utility::SharedConcurrentTimeoutTimer timer; + ad_utility::AllocatorWithLimit unlimitedAllocator{ + ad_utility::makeAllocationMemoryLeftThreadsafeObject( + std::numeric_limits::max())}; + + // Get the name of the permutation and names for the IDs from the triple + // (for visualization only, can eventually be deleted). 
+ auto& pname = permutation._readableName; + std::string name1 = getNameForId(id1); + std::string name2 = getNameForId(id2); + std::string name3 = getNameForId(id3); + std::string tname = absl::StrCat(std::string{pname[0]}, "=", name1, " ", + std::string{pname[1]}, "=", name2, " ", + std::string{pname[2]}, "=", name3); + + // Find the index of the first block where the last triple is not smaller. + // + // NOTE: With `_col2LastId` added to `CompressedBlockMetadata`, this can now + // be computed without having to decompress any blocks at this point. See the + // first revision of this branch for code, where blocks with equal `id1` and + // `id2` were decompressed to also check for `id3`. + const vector& blocks = meta.blockData(); + auto matchingBlock = std::lower_bound( + blocks.begin(), blocks.end(), std::array{id1, id2, id3}, + [&](const CompressedBlockMetadata& block, const auto& triple) -> bool { + if (block._col0LastId < triple[0]) { + return true; + } else if (block._col0LastId == triple[0]) { + if (block._col1LastId < triple[1]) { + return true; + } else if (block._col1LastId == triple[1]) { + return block._col2LastId < triple[2]; + } + } + return false; + }); + size_t blockIndex = matchingBlock - blocks.begin(); + + // If all IDs from all blocks are smaller, we return the index of the last + // block plus one (typical "end" semantics) and any position in the block (in + // the code that uses the result, that position will not be used in this + // case). + if (matchingBlock == blocks.end()) { + if (visualize) { + std::cout << "All triples in " << pname << " are smaller than " << tname + << std::endl; + } + return FindTripleResult{blocks.size(), std::numeric_limits::max(), + id1, id2, id3}; + } + + // Read and decompress the block. Note that we are potentially doing this a + // second time here (the block has probably already been looked at in the call + // to `std::lower_bound` above). 
+ DecompressedBlock blockTuples = + reader.readAndDecompressBlock(*matchingBlock, file, std::nullopt); + + // Get the most significant IDs for this block (might only be one or several, + // stored implicitly in the metadata). + // + // TODO: This is inefficient and not necessary. However, the current interface + // of `IndexMetaData` doesn't make it easy to get the most significant IDs for + // a block. + size_t blockSize = blockTuples.numRows(); + std::vector mostSignificantIdsInBlock(blockSize); + std::vector mostSignificantIdsDistinct; + for (auto it = meta._data.begin(); it != meta._data.end(); ++it) { + const auto& relationMetadata = meta.getMetaData(it.getId()); + Id id = relationMetadata._col0Id; + uint64_t offset = relationMetadata._offsetInBlock; + size_t numRows = relationMetadata._numRows; + if (offset == std::numeric_limits::max()) { + offset = 0; + } + if (id >= matchingBlock->_col0FirstId && id <= matchingBlock->_col0LastId) { + mostSignificantIdsDistinct.push_back(id); + for (size_t i = 0; i < numRows && offset + i < blockSize; ++i) { + mostSignificantIdsInBlock[offset + i] = id; + } + } + } + std::sort(mostSignificantIdsDistinct.begin(), + mostSignificantIdsDistinct.end()); + + // Find the first triple that is not smaller. If the triple is contained in + // the block that will be the position of the triple. Otherwise it will be the + // position of the first triple that is larger. Since the last triple of this + // block is not smaller, this will not be larger than the last valid index in + // the block. + + // First check whether `id1` occurs at all in this block. If not, the index we + // are searching is just the position of the first ID that is larger. + // Otherwise, we can do binary search in the portion of the block with that + // ID as most significant ID. 
+ size_t rowIndexInBlock = std::numeric_limits::max(); + auto mostSignificantIdsMatch = + std::lower_bound(mostSignificantIdsDistinct.begin(), + mostSignificantIdsDistinct.end(), id1); + if (*mostSignificantIdsMatch > id1) { + rowIndexInBlock = meta.getMetaData(*mostSignificantIdsMatch)._offsetInBlock; + if (rowIndexInBlock == std::numeric_limits::max()) { + rowIndexInBlock = 0; + } + } else { + AD_CORRECTNESS_CHECK(*mostSignificantIdsMatch == id1); + size_t offsetBegin = meta.getMetaData(id1)._offsetInBlock; + size_t offsetEnd = offsetBegin + meta.getMetaData(id1)._numRows; + if (offsetBegin == std::numeric_limits::max()) { + offsetBegin = 0; + offsetEnd = blockTuples.size(); + } + rowIndexInBlock = + std::lower_bound(blockTuples.begin() + offsetBegin, + blockTuples.begin() + offsetEnd, + std::array{id2, id3}, + [](const auto& a, const auto& b) { + return a[0] < b[0] || (a[0] == b[0] && a[1] < b[1]); + }) - + blockTuples.begin(); + } + AD_CORRECTNESS_CHECK(rowIndexInBlock != std::numeric_limits::max()); + + // Show the respective block. + if (visualize) { + std::cout << std::endl; + std::cout << "Block #" << blockIndex << " from " << pname << " (" << tname + << "):" << std::endl; + // Now we are ready to write the triples in the block, including the most + // significant ID. 
+ for (size_t i = 0; i < blockSize; ++i) { + std::cout << "Row #" << i << ": " + << getNameForId(mostSignificantIdsInBlock[i]); + for (size_t j = 0; j < blockTuples.numColumns(); ++j) { + std::cout << " " << getNameForId(blockTuples(i, j)); + } + if (i == rowIndexInBlock) { + std::cout << " <--"; + } + std::cout << std::endl; + } + } + + return FindTripleResult{blockIndex, rowIndexInBlock, id1, id2, id3}; +} + +// ____________________________________________________________________________ +std::string DeltaTriples::getNameForId(Id id) const { + auto lookupResult = + ExportQueryExecutionTrees::idToStringAndType(index_, id, localVocab_); + AD_CONTRACT_CHECK(lookupResult.has_value()); + const auto& [value, type] = lookupResult.value(); + return type ? absl::StrCat("\"", value, "\"^^<", type, ">") : value; +}; diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h new file mode 100644 index 0000000000..1036d10c57 --- /dev/null +++ b/src/index/DeltaTriples.h @@ -0,0 +1,166 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#pragma once + +#include "engine/LocalVocab.h" +#include "global/Id.h" +#include "index/Index.h" +#include "index/IndexBuilderTypes.h" +#include "parser/TurtleParser.h" +#include "util/HashSet.h" + +// A class for maintaining triples that were inserted or deleted after index +// building. +// +// HOW IT WORKS: +// +// 1. For each "delta triple", find the "matching position" (block index and +// index within that block, see below for a precise definition) for each index +// permutation. +// +// 2. For each permutation and each block, store a sorted list of the positions +// of the delta triples within that block. +// +// 3. In the call of `CompressedRelation::scan`, for each block use the +// information from 2. 
to check whether there are delta triples for that block +// and if yes, merge them with the triples from the block (given that the +// positions are sorted, this can be done with negligible overhead). +// +// NOTE: For now, this only works when the results of index scans are not +// cached (at least not when there are relevant delta triples for that scan). +// There are two ways this can play out in the future: +// +// Either we generally do not cache the results of index scans anymore. This +// would have various advantages; in particular, joining with something like +// `rdf:type` would then be possible without storing the whole relation in +// RAM. However, we then need a faster decompression and maybe a smaller block +// size (currently 8 MB). +// +// Or we add the delta triples when iterating over the cached (uncompressed) +// result from the index scan. In that case, we would need to (in Step 1 above) +// store and maintain the positions in those uncompressed index scans. +// +// POSITION WHERE A TRIPLE "FITS" IN A PERMUTATION: +// +// 1. If the triple is contained in the permutation, it is contained exactly +// once and so there is a well-defined block and position in that block. +// +// 2. If there is a block where the first triple is smaller and the last triple +// is larger, then that is the block and the position in that block is that of +// the first triple that is (not smaller and hence) larger. +// +// 3. If the triple falls "between two blocks" (the last triple of the previous +// block is smaller and the first triple of the next block is larger), then take +// the first position in that next block. +// +// 4. Two possibilities remain: +// 4a. The triple is smaller than the first triple of the first block: then take +// that block and the first position in that block. +// 4b. The triple is larger than the last triple of the last block: then take +// that block and the last position in that block. 
+// +// NOTE: For now, this is a proof-of-concept implementation and the class is +// simplistic in many ways (TODO: make a list, in which ways exactly). +class DeltaTriples { + public: + // Inside this class, we are working with triples of IDs. + using IdTriple = std::array; + + // Hash value for such triple. + template + friend H AbslHashValue(H h, const IdTriple& triple) { + return H::combine(std::move(h), triple[0], triple[1], triple[2]); + } + + // Result record returned by `findTripleInPermutation`, containing: the + // reference to the permutation, the index of the matching block, the index of + // a position within that block, and the search triple in the right + // permutation (for example, for SPO, `id1` is the subject, `id2` is the + // predicate, and `id3` is the object). + // template + struct FindTripleResult { + // const Permutation& permutation; + size_t blockIndex; + size_t rowIndexInBlock; + Id id1; + Id id2; + Id id3; + }; + + // Data structures with position for a particular table (just an example, this + // is not the final data structure). + using Position = std::pair; + using Positions = ad_utility::HashMap>; + + private: + // The index to which these triples are added. + const Index& index_; + + // The sets of triples added to and subtracted from the original index + // + // NOTE: The methods for adding and subtracting should make sure that only + // triples are added that are not already contained in the original index and + // that only triples are subtracted that are contained in the original index. + // In particular, no triple can be in both of these sets. + ad_utility::HashSet triplesInserted_; + ad_utility::HashSet triplesDeleted_; + + // The local vocabulary of these triples. + LocalVocab localVocab_; + + public: + // The positions of the delta triples in each of the six permutations. + // + // TODO: Do the positions need to know to which permutation they belong? 
+ std::vector posFindTripleResults_; + std::vector psoFindTripleResults_; + std::vector sopFindTripleResults_; + std::vector spoFindTripleResults_; + std::vector opsFindTripleResults_; + std::vector ospFindTripleResults_; + + public: + // Construct for given index. + DeltaTriples(const Index& index) : index_(index) {} + + // Clear `triplesInserted_` and `triplesDeleted_` (the position vectors and + // `localVocab_` are currently not cleared). + void clear(); + + // The number of delta triples inserted and deleted. + size_t numInserted() const { return triplesInserted_.size(); } + size_t numDeleted() const { return triplesDeleted_.size(); } + + // Insert triple. + void insertTriple(TurtleTriple turtleTriple); + + // Delete triple. + void deleteTriple(TurtleTriple turtleTriple); + + // TODO: made public as long as we are trying to figure out how this works. + private: + public: + // Get triples of `Id`s from `TurtleTriple` (which is the kind of triple we + // get from `TurtleParser`, see the code currently handling insertions and + // deletions in `Server.cpp`). + // + // NOTE: This is not `const` because translating to IDs may augment the local + // vocabulary. + IdTriple getIdTriple(TurtleTriple turtleTriple); + + // Find the position of the given triple in each of the six permutations (a + // pair of block index and index within that block; see the documentation of + // the class above for how exactly that position is defined in all cases). + void findTripleInAllPermutations(const IdTriple& idTriple, + bool visualize = false); + + // The implementation of the above function for a single permutation. + template + FindTripleResult findTripleInPermutation( + Id id1, Id id2, Id id3, Permutation& permutation, bool visualize) const; + + // Resolve ID to name (useful for debugging and testing). 
+ std::string getNameForId(Id id) const; +}; diff --git a/src/index/MetaDataHandler.h b/src/index/MetaDataHandler.h index bbf545e095..fe9dc070ab 100644 --- a/src/index/MetaDataHandler.h +++ b/src/index/MetaDataHandler.h @@ -1,24 +1,33 @@ -// Copyright 2018, University of Freiburg, +// Copyright 2018 - 2023, University of Freiburg // Chair of Algorithms and Data Structures -// Author: Johannes Kalmbach (johannes.kalmbach@gmail.com) -// +// Authors: Johannes Kalmbach +// Hannah Bast + #pragma once #include #include -#include "../global/Id.h" -#include "../util/Exception.h" -#include "../util/HashMap.h" -#include "../util/Iterators.h" -#include "../util/Log.h" -#include "../util/Serializer/Serializer.h" -#include "./CompressedRelation.h" - -// _____________________________________________________________________ +#include "global/Id.h" +#include "index/CompressedRelation.h" +#include "util/Exception.h" +#include "util/HashMap.h" +#include "util/Iterators.h" +#include "util/Log.h" +#include "util/Serializer/Serializer.h" + +// Class for access to relation metadata stored in a vector. Specifically, our +// index uses this with `M = MmapVector>`; see +// `index/IndexMetaData.h` template class MetaDataWrapperDense { + private: + // A vector of metadata objects. + M _vec; + public: + // An iterator with an additional method `getId()` that gives the relation ID + // of the current metadata object. template struct AddGetIdIterator : BaseIterator { using BaseIterator::BaseIterator; @@ -39,6 +48,7 @@ class MetaDataWrapperDense { // The underlying array is sorted, so all iterators are ordered iterators using ConstOrderedIterator = ConstIterator; + // The type of the stored metadata objects. 
using value_type = typename M::value_type; // _________________________________________________________ @@ -120,7 +130,9 @@ class MetaDataWrapperDense { // ___________________________________________________________ std::string getFilename() const { return _vec.getFilename(); } - private: + // The following used to be private (because they were only used as + // subroutines in the above), but we now need them in + // `DeltaTriples::findTripleResult`. ConstIterator lower_bound(Id id) const { auto cmp = [](const auto& metaData, Id id) { return metaData._col0Id < id; @@ -133,13 +145,24 @@ class MetaDataWrapperDense { }; return std::lower_bound(_vec.begin(), _vec.end(), id, cmp); } - M _vec; }; -// _____________________________________________________________________ +// Class for access to relation metadata stored in a hash map. Specifically, our +// index uses this with `M = HashMap>`; see +// `index/IndexMetaData.h` template class MetaDataWrapperHashMap { + private: + // The map that maps each existing relation ID to its metadata object. + hashMap _map; + + // The relation IDs in sorted order. This is only computed by the `serialize` + // function defined by `AD_SERIALIZE_FRIEND_FUNCTION` below. + std::vector _sortedKeys; + public: + // An iterator with an additional method `getId()` that gives the relation ID + // of the current metadata object. template struct AddGetIdIterator : public BaseIterator { using BaseIterator::BaseIterator; @@ -237,6 +260,7 @@ class MetaDataWrapperHashMap { return _map.count(id); } + // Defines a friend method `serialize`, see the macro definition for details. 
AD_SERIALIZE_FRIEND_FUNCTION(MetaDataWrapperHashMap) { serializer | arg._map; if constexpr (ad_utility::serialization::ReadSerializer) { @@ -252,7 +276,26 @@ class MetaDataWrapperHashMap { const auto& sortedKeys() const { return _sortedKeys; } - private: - hashMap _map; - std::vector _sortedKeys; + // The following are needed in `DeltaTriples::findTripleResult`, so that we + // have a common interface for all permutations. The functionality is as + // follows: + // + // If a metadata object with the given `id` exists, return it (or rather, an + // iterator to the corresponding key-value pair in the hash map). If it is + // not contained, return `end()`. + // + // This makes sense because this class is only used for storing the metadata + // of the POS and PSO permutations. If we search an ID of a predicate that + // does not exist in the index, it will get an ID from the local vocab, which + // is larger than all existing IDs. + // + // TODO: This is hacky and should be made less hacky. + // + // (Note that this is different for OPS and OSP, because objects can also be + // values and values that did not previously exist in the index get an + // ID that can be properly compared with existing IDs. But that works find + // because we store the metadata for OPS and OSP, as well as for SOP and SPO, + // using a vector, see the class `MetaDataWrapperDense` above.) + ConstIterator lower_bound(Id id) const { return _map.find(id); } + Iterator lower_bound(Id id) { return _map.find(id); } }; diff --git a/test/DeltaTriplesTest.cpp b/test/DeltaTriplesTest.cpp new file mode 100644 index 0000000000..16c308bd63 --- /dev/null +++ b/test/DeltaTriplesTest.cpp @@ -0,0 +1,174 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Hannah Bast + +#include + +#include "./IndexTestHelpers.h" +#include "absl/strings/str_split.h" +#include "index/DeltaTriples.h" +#include "index/IndexImpl.h" +#include "parser/TurtleParser.h" + +// Fixture that sets up a test index. +class DeltaTriplesTest : public ::testing::Test { + protected: + // The triples in our test index (as a `std::vector` so that we have easy + // access to each triple separately in the tests below). + static constexpr const char* testTurtle = + " 2 . " + " . " + " . " + " . " + " . " + " . " + " . " + " . " + " . " + " . " + " . " + " . " + " . " + " "; + + // Query execution context with index for testing, see `IndexTestHelpers.h`. + QueryExecutionContext* testQec = ad_utility::testing::getQec(testTurtle); + + // The individual triples (useful for testing below). + // + // NOTE: It looks like it would make more send to define a static class member + // of type `std::vector` in the first place that contains the + // individual triples and then concatenate them with `absl::StrJoin` for + // `getQec`, but C++ doesn't allow non-literal static class members. + std::vector testTriples() { + return absl::StrSplit(testTurtle, " . "); + } + + // Make `TurtleTriple` from given Turtle input. + TurtleTriple makeTurtleTriple(std::string turtle) { + TurtleStringParser parser; + parser.parseUtf8String(std::move(turtle)); + AD_CONTRACT_CHECK(parser.getTriples().size() == 1); + return parser.getTriples()[0]; + } + + // Make `IdTriple` from given Turtle input (the first argument is not `const` + // because we might change the local vocabulary). + DeltaTriples::IdTriple makeIdTriple(DeltaTriples& deltaTriples, + std::string turtle) { + return deltaTriples.getIdTriple(makeTurtleTriple(std::move(turtle))); + } + + // Get the permutation with the given `enum` name. + // ... +}; + +// Test the constructor. 
+TEST_F(DeltaTriplesTest, constructor) { + DeltaTriples deltaTriples(testQec->getIndex()); + ASSERT_EQ(deltaTriples.numInserted(), 0); + ASSERT_EQ(deltaTriples.numDeleted(), 0); +} + +// Test clear after inserting or deleting a few triples. +TEST_F(DeltaTriplesTest, clear) { + // Insert then clear. + DeltaTriples deltaTriples(testQec->getIndex()); + deltaTriples.insertTriple(makeTurtleTriple(" ")); + ASSERT_EQ(deltaTriples.numInserted(), 1); + ASSERT_EQ(deltaTriples.numDeleted(), 0); + deltaTriples.clear(); + ASSERT_EQ(deltaTriples.numInserted(), 0); + ASSERT_EQ(deltaTriples.numDeleted(), 0); + + // Delete then clear. + deltaTriples.deleteTriple(makeTurtleTriple(" ")); + ASSERT_EQ(deltaTriples.numInserted(), 0); + ASSERT_EQ(deltaTriples.numDeleted(), 1); + deltaTriples.clear(); + ASSERT_EQ(deltaTriples.numInserted(), 0); + ASSERT_EQ(deltaTriples.numInserted(), 0); +} + +// Check that `findTripleInAllPermutations` locates triples correctly in all +// cases (triples that exist in the index, as well as those that do not). +TEST_F(DeltaTriplesTest, findTripleInAllPermutations) { + const Index& index = testQec->getIndex(); + DeltaTriples deltaTriples(index); + + // Check the given `FindTripleResult` for the given permutation. 
+ auto checkFindTripleResult = + [&](const DeltaTriples::FindTripleResult& findTripleResult, + const auto& permutation) { + auto& file = permutation._file; + const auto& meta = permutation._meta; + const auto& reader = permutation._reader; + + const size_t& blockIndex = findTripleResult.blockIndex; + const size_t& rowIndexInBlock = findTripleResult.rowIndexInBlock; + + auto& pname = permutation._readableName; + std::string name1 = deltaTriples.getNameForId(findTripleResult.id1); + std::string name2 = deltaTriples.getNameForId(findTripleResult.id2); + std::string name3 = deltaTriples.getNameForId(findTripleResult.id3); + std::string tname = absl::StrCat(std::string{pname[0]}, "=", name1, " ", + std::string{pname[1]}, "=", name2, " ", + std::string{pname[2]}, "=", name3); + std::string msg = + absl::StrCat("Permutation ", pname, ", triple ", tname, "\n"); + + const vector& blocks = meta.blockData(); + ASSERT_LT(blockIndex, blocks.size()) << msg; + const auto& block = blocks.at(blockIndex); + const auto& blockTuples = + reader.readAndDecompressBlock(block, file, std::nullopt); + ASSERT_LT(rowIndexInBlock, blockTuples.size()) << msg; + ASSERT_EQ(blockTuples(rowIndexInBlock, 0), findTripleResult.id2) << msg; + ASSERT_EQ(blockTuples(rowIndexInBlock, 1), findTripleResult.id3) << msg; + }; + + // Check if each existing triple is located correctly in every permutation. 
+ size_t numTriples = 0; + for (std::string_view triple : testTriples()) { + DeltaTriples::IdTriple idTriple = + makeIdTriple(deltaTriples, std::string{triple}); + deltaTriples.findTripleInAllPermutations(idTriple); + ++numTriples; + ASSERT_EQ(deltaTriples.posFindTripleResults_.size(), numTriples); + checkFindTripleResult(deltaTriples.posFindTripleResults_.back(), + index.getImpl().POS()); + checkFindTripleResult(deltaTriples.psoFindTripleResults_.back(), + index.getImpl().PSO()); + checkFindTripleResult(deltaTriples.spoFindTripleResults_.back(), + index.getImpl().SPO()); + checkFindTripleResult(deltaTriples.sopFindTripleResults_.back(), + index.getImpl().SOP()); + checkFindTripleResult(deltaTriples.opsFindTripleResults_.back(), + index.getImpl().OPS()); + checkFindTripleResult(deltaTriples.ospFindTripleResults_.back(), + index.getImpl().OSP()); + } +} + +// Visualize the result of `findTripleInPermutation` for one particular triple +// by showing the whole block (for understanding and debugging only, this will +// eventually be deleted). +TEST_F(DeltaTriplesTest, findTripleInAllPermutationsVisualize) { + DeltaTriples deltaTriples(testQec->getIndex()); + std::string tripleAsString = " "; + // std::string tripleAsString = " "; + // std::string tripleAsString = " "; + // std::string tripleAsString = " <0> "; + // std::string tripleAsString = " "; + std::cout << std::endl; + std::cout << "Searching the following triple: " << tripleAsString + << std::endl; + std::cout + << "For each permutation, find the first element that is not smaller" + << std::endl; + + // Search the triple in all permutations. 
+ DeltaTriples::IdTriple idTriple = makeIdTriple(deltaTriples, tripleAsString); + deltaTriples.findTripleInAllPermutations(idTriple, true); + std::cout << std::endl; +} diff --git a/test/IndexMetaDataTest.cpp b/test/IndexMetaDataTest.cpp index 5c5b5af654..f1af1e04e3 100644 --- a/test/IndexMetaDataTest.cpp +++ b/test/IndexMetaDataTest.cpp @@ -17,7 +17,7 @@ auto V = ad_utility::testing::VocabId; TEST(RelationMetaDataTest, writeReadTest) { CompressedBlockMetadata rmdB{ - {{12, 34}, {46, 11}}, 5, V(0), V(2), V(13), V(24)}; + {{12, 34}, {46, 11}}, 5, V(0), V(2), V(13), V(24), V(62)}; CompressedRelationMetadata rmdF{V(1), 3, 2.0, 42.0, 16}; ad_utility::serialization::FileWriteSerializer f("_testtmp.rmd"); @@ -39,9 +39,9 @@ TEST(RelationMetaDataTest, writeReadTest) { TEST(IndexMetaDataTest, writeReadTest2Hmap) { vector bs; bs.push_back(CompressedBlockMetadata{ - {{12, 34}, {42, 5}}, 5, V(0), V(2), V(13), V(24)}); + {{12, 34}, {42, 5}}, 5, V(0), V(2), V(13), V(24), V(62)}); bs.push_back(CompressedBlockMetadata{ - {{16, 34}, {165, 3}}, 5, V(0), V(2), V(13), V(24)}); + {{16, 34}, {165, 3}}, 5, V(0), V(2), V(13), V(24), V(62)}); CompressedRelationMetadata rmdF{V(1), 3, 2.0, 42.0, 16}; CompressedRelationMetadata rmdF2{V(2), 5, 3.0, 43.0, 10}; IndexMetaDataHmap imd; @@ -71,9 +71,9 @@ TEST(IndexMetaDataTest, writeReadTest2Mmap) { std::string mmapFilename = imdFilename + ".mmap"; vector bs; bs.push_back(CompressedBlockMetadata{ - {{12, 34}, {42, 17}}, 5, V(0), V(2), V(13), V(24)}); + {{12, 34}, {42, 17}}, 5, V(0), V(2), V(13), V(24), V(62)}); bs.push_back(CompressedBlockMetadata{ - {{12, 34}, {16, 12}}, 5, V(0), V(2), V(13), V(24)}); + {{12, 34}, {16, 12}}, 5, V(0), V(2), V(13), V(24), V(62)}); CompressedRelationMetadata rmdF{V(1), 3, 2.0, 42.0, 16}; CompressedRelationMetadata rmdF2{V(2), 5, 3.0, 43.0, 10}; // The index MetaData does not have an explicit clear, so we From aec63702f47c1a9c9096286953da42f9aada6256 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sat, 18 Mar 2023 
21:23:52 +0100 Subject: [PATCH 03/20] Triple location code is now correct and well tested 1. Refactored `DeltaTriples::locateTripleInAllPermutations` (the central method of this class). 2. Wrote a test that checks all triples that are contained in the index as well as a slightly modified version of each triple that is not in the index. The test checks that the triple has been located at the exact right position in all permutations. (This is harder than it seems because a lot of things can go wrong + we do not have the relation `Id`s for the blocks explicitly, but only implicitly via the relation metadata.) 3. The method `locateTripleInAllPermutations` now inserts the results into proper data structures that can then be used conveniently in an index scan (writing that code is the next step, but it should be relatively straightforward now). --- .gitignore | 6 +- src/index/DeltaTriples.cpp | 231 ++++++++++++++++-------- src/index/DeltaTriples.h | 89 ++++++--- src/index/MetaDataHandler.h | 9 +- test/DeltaTriplesTest.cpp | 351 ++++++++++++++++++++++++++++++------ test/IndexTestHelpers.h | 9 +- 6 files changed, 536 insertions(+), 159 deletions(-) diff --git a/.gitignore b/.gitignore index d3ca2d604e..e29be6343d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,11 @@ -# Build directory +# Build directories build/ +debug/ cmake-build* +# Debugger history file +.gdb_history + # End-to-End data e2e_data/* # Compiled Object files diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 6a763788a5..e51bcd5185 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -15,6 +15,12 @@ void DeltaTriples::clear() { triplesInserted_.clear(); triplesDeleted_.clear(); + triplesWithPositionsPerBlockInPSO_.clear(); + triplesWithPositionsPerBlockInPOS_.clear(); + triplesWithPositionsPerBlockInSPO_.clear(); + triplesWithPositionsPerBlockInSOP_.clear(); + triplesWithPositionsPerBlockInOSP_.clear(); + triplesWithPositionsPerBlockInOPS_.clear(); } //
____________________________________________________________________________ @@ -29,6 +35,28 @@ void DeltaTriples::deleteTriple(TurtleTriple turtleTriple) { triplesDeleted_.insert(idTriple); } +// ____________________________________________________________________________ +const DeltaTriples::TriplesWithPositionsPerBlock& +DeltaTriples::getTriplesWithPositionsPerBlock( + Index::Permutation permutation) const { + switch (permutation) { + case Index::Permutation::PSO: + return triplesWithPositionsPerBlockInPSO_; + case Index::Permutation::POS: + return triplesWithPositionsPerBlockInPOS_; + case Index::Permutation::SPO: + return triplesWithPositionsPerBlockInSPO_; + case Index::Permutation::SOP: + return triplesWithPositionsPerBlockInSOP_; + case Index::Permutation::OSP: + return triplesWithPositionsPerBlockInOSP_; + case Index::Permutation::OPS: + return triplesWithPositionsPerBlockInOPS_; + default: + AD_FAIL(); + } +} + // ____________________________________________________________________________ DeltaTriples::IdTriple DeltaTriples::getIdTriple(TurtleTriple turtleTriple) { TripleComponent subject = std::move(turtleTriple._subject); @@ -41,26 +69,43 @@ DeltaTriples::IdTriple DeltaTriples::getIdTriple(TurtleTriple turtleTriple) { } // ____________________________________________________________________________ -void DeltaTriples::findTripleInAllPermutations(const IdTriple& idTriple, - bool visualize) { +void DeltaTriples::locateTripleInAllPermutations(const IdTriple& idTriple, + bool visualize) { + // Helper lambda for adding `tripleWithPosition` to given + // `TriplesWithPositionsPerBlock` list. 
+ auto addTripleWithPosition = + [&](const TripleWithPosition& tripleWithPosition, + TriplesWithPositionsPerBlock& triplesWithPositionsPerBlock) { + triplesWithPositionsPerBlock.positionMap_[tripleWithPosition.blockIndex] + .emplace_back(tripleWithPosition); + }; + + // Now locate the triple in each permutation and add it to the correct + // `TriplesWithPositionsPerBlock` list. auto [s, p, o] = idTriple; - psoFindTripleResults_.emplace_back( - findTripleInPermutation(p, s, o, index_.getImpl().PSO(), visualize)); - posFindTripleResults_.emplace_back( - findTripleInPermutation(p, o, s, index_.getImpl().POS(), visualize)); - spoFindTripleResults_.emplace_back( - findTripleInPermutation(s, p, o, index_.getImpl().SPO(), visualize)); - sopFindTripleResults_.emplace_back( - findTripleInPermutation(s, o, p, index_.getImpl().SOP(), visualize)); - ospFindTripleResults_.emplace_back( - findTripleInPermutation(o, s, p, index_.getImpl().OSP(), visualize)); - opsFindTripleResults_.emplace_back( - findTripleInPermutation(o, p, s, index_.getImpl().OPS(), visualize)); + addTripleWithPosition( + locateTripleInPermutation(p, s, o, index_.getImpl().PSO(), visualize), + triplesWithPositionsPerBlockInPSO_); + addTripleWithPosition( + locateTripleInPermutation(p, o, s, index_.getImpl().POS(), visualize), + triplesWithPositionsPerBlockInPOS_); + addTripleWithPosition( + locateTripleInPermutation(s, p, o, index_.getImpl().SPO(), visualize), + triplesWithPositionsPerBlockInSPO_); + addTripleWithPosition( + locateTripleInPermutation(s, o, p, index_.getImpl().SOP(), visualize), + triplesWithPositionsPerBlockInSOP_); + addTripleWithPosition( + locateTripleInPermutation(o, s, p, index_.getImpl().OSP(), visualize), + triplesWithPositionsPerBlockInOSP_); + addTripleWithPosition( + locateTripleInPermutation(o, p, s, index_.getImpl().OPS(), visualize), + triplesWithPositionsPerBlockInOPS_); } // ____________________________________________________________________________ template 
-DeltaTriples::FindTripleResult DeltaTriples::findTripleInPermutation( +DeltaTriples::TripleWithPosition DeltaTriples::locateTripleInPermutation( Id id1, Id id2, Id id3, Permutation& permutation, bool visualize) const { // Get the internal data structures from the permutation. auto& file = permutation._file; @@ -104,18 +149,30 @@ DeltaTriples::FindTripleResult DeltaTriples::findTripleInPermutation( }); size_t blockIndex = matchingBlock - blocks.begin(); + // Preliminary `FindTripleResult` object with the correct `blockIndex` and + // IDs, but still an invalid `rowIndexInBlock` and `existsInIndex` set to + // `false`. + TripleWithPosition tripleWithPosition{ + blockIndex, std::numeric_limits::max(), id1, id2, id3, false}; + // If all IDs from all blocks are smaller, we return the index of the last // block plus one (typical "end" semantics) and any position in the block (in // the code that uses the result, that position will not be used in this // case). if (matchingBlock == blocks.end()) { + AD_CORRECTNESS_CHECK(blockIndex == blocks.size()); if (visualize) { + std::cout << endl; std::cout << "All triples in " << pname << " are smaller than " << tname << std::endl; } - return FindTripleResult{blocks.size(), std::numeric_limits::max(), - id1, id2, id3}; + return tripleWithPosition; } + auto showTriple = [](const std::string& prefix, + const std::vector& triple) { + std::cout << prefix << triple[0] << " " << triple[1] << " " << triple[2] + << std::endl; + }; // Read and decompress the block. Note that we are potentially doing this a // second time here (the block has probably already been looked at in the call @@ -123,61 +180,64 @@ DeltaTriples::FindTripleResult DeltaTriples::findTripleInPermutation( DecompressedBlock blockTuples = reader.readAndDecompressBlock(*matchingBlock, file, std::nullopt); - // Get the most significant IDs for this block (might only be one or several, - // stored implicitly in the metadata). 
+ if (0) { + std::vector ourTriple = {id1, id2, id3}; + std::vector lastTripleInBlock = {matchingBlock->_col0LastId, + matchingBlock->_col1LastId, + matchingBlock->_col2LastId}; + std::vector trueLastTripleInBlock = { + Id::makeUndefined(), blockTuples.back()[0], blockTuples.back()[1]}; + std::cout << std::endl; + showTriple("Ours: ", ourTriple); + showTriple("Last: ", lastTripleInBlock); + showTriple("True: ", trueLastTripleInBlock); + AD_CORRECTNESS_CHECK(ourTriple <= lastTripleInBlock); + } + + // Find the smallest "relation" ID that is not smaller than `id1` and get its + // metadata and the position of the first and last triple with that ID in the + // block. // - // TODO: This is inefficient and not necessary. However, the current interface - // of `IndexMetaData` doesn't make it easy to get the most significant IDs for - // a block. - size_t blockSize = blockTuples.numRows(); - std::vector mostSignificantIdsInBlock(blockSize); - std::vector mostSignificantIdsDistinct; - for (auto it = meta._data.begin(); it != meta._data.end(); ++it) { - const auto& relationMetadata = meta.getMetaData(it.getId()); - Id id = relationMetadata._col0Id; - uint64_t offset = relationMetadata._offsetInBlock; - size_t numRows = relationMetadata._numRows; - if (offset == std::numeric_limits::max()) { - offset = 0; - } - if (id >= matchingBlock->_col0FirstId && id <= matchingBlock->_col0LastId) { - mostSignificantIdsDistinct.push_back(id); - for (size_t i = 0; i < numRows && offset + i < blockSize; ++i) { - mostSignificantIdsInBlock[offset + i] = id; - } - } + // IMPORTANT FIX: If relation `id1` exists in the index, but our triple is + // larger than all triples of that relation in the index and the last triple + // of that relation ends a block, then our block search above (correctly) + // landed us at the next block. 
We can detect this by checking whether the + // first relation ID of the block is larger than `id1` and then we should + // get the metadata for the ID and not for `id1` (which would pertain to a + // previous block). + // + // TODO: There is still a bug in `MetaDataWrapperHashMap::lower_bound`, which + // is relevant in the rare case where a triple is inserted with an `Id` for + // predicate that is not a new `Id`, but has not been used for a predicate in + // the original index. + // + // NOTE: Since we have already handled the case, where all IDs in the + // permutation are smaller, above, such a relation should exist. + Id searchId = + matchingBlock->_col0FirstId > id1 ? matchingBlock->_col0FirstId : id1; + const auto& it = meta._data.lower_bound(searchId); + AD_CORRECTNESS_CHECK(it != meta._data.end()); + Id id = it.getId(); + const auto& relationMetadata = meta.getMetaData(id); + size_t offsetBegin = relationMetadata._offsetInBlock; + size_t offsetEnd = offsetBegin + relationMetadata._numRows; + // Note: If the relation spans multiple blocks, we know that the block we + // found above contains only triples from that relation. + if (offsetBegin == std::numeric_limits::max()) { + offsetBegin = 0; + offsetEnd = blockTuples.size(); } - std::sort(mostSignificantIdsDistinct.begin(), - mostSignificantIdsDistinct.end()); - - // Find the first triple that is not smaller. If the triple is contained in - // the block that will be the position of the triple. Otherwise it will be the - // position of the first triple that is larger. Since the last triple of this - // block is not smaller, this will not be larger than the last valid index in - // the block. - - // First check whether `id1` occurs at all in this block. If not, the index we - // are searching is just the position of the first ID that is larger. - // Otherwise, we can do binary search in the portion of the block with that - // ID as most significant ID. 
- size_t rowIndexInBlock = std::numeric_limits::max(); - auto mostSignificantIdsMatch = - std::lower_bound(mostSignificantIdsDistinct.begin(), - mostSignificantIdsDistinct.end(), id1); - if (*mostSignificantIdsMatch > id1) { - rowIndexInBlock = meta.getMetaData(*mostSignificantIdsMatch)._offsetInBlock; - if (rowIndexInBlock == std::numeric_limits::max()) { - rowIndexInBlock = 0; - } - } else { - AD_CORRECTNESS_CHECK(*mostSignificantIdsMatch == id1); - size_t offsetBegin = meta.getMetaData(id1)._offsetInBlock; - size_t offsetEnd = offsetBegin + meta.getMetaData(id1)._numRows; - if (offsetBegin == std::numeric_limits::max()) { - offsetBegin = 0; - offsetEnd = blockTuples.size(); - } - rowIndexInBlock = + AD_CORRECTNESS_CHECK(offsetBegin <= blockTuples.size()); + AD_CORRECTNESS_CHECK(offsetEnd <= blockTuples.size()); + + // If we have found `id1`, we can do a binary search in the portion of the + // block that pertains to it (note the special case mentioned above, where we + // are already at the beginning of the next block). + // + // Otherwise, `id` is the next larger ID and the position of the first triple + // of that relation is exactly the position we are looking for. + if (id == id1) { + tripleWithPosition.rowIndexInBlock = std::lower_bound(blockTuples.begin() + offsetBegin, blockTuples.begin() + offsetEnd, std::array{id2, id3}, @@ -185,30 +245,44 @@ DeltaTriples::FindTripleResult DeltaTriples::findTripleInPermutation( return a[0] < b[0] || (a[0] == b[0] && a[1] < b[1]); }) - blockTuples.begin(); + // Check if the triple at the found position is equal to `id1 id2 id3`. Note + // that our default for `existsInIndex` was set to `false` above. 
+ const size_t& i = tripleWithPosition.rowIndexInBlock; + AD_CORRECTNESS_CHECK(i < blockTuples.size()); + if (i < offsetEnd && blockTuples(i, 0) == id2 && blockTuples(i, 1) == id3) { + tripleWithPosition.existsInIndex = true; + } + } else { + AD_CORRECTNESS_CHECK(id1 < id); + tripleWithPosition.rowIndexInBlock = offsetBegin; } - AD_CORRECTNESS_CHECK(rowIndexInBlock != std::numeric_limits::max()); - // Show the respective block. + // Show the respective block. Note that we can show the relation ID only for + // a part of the block (maybe the whole block, but not always). if (visualize) { std::cout << std::endl; std::cout << "Block #" << blockIndex << " from " << pname << " (" << tname << "):" << std::endl; // Now we are ready to write the triples in the block, including the most // significant ID. - for (size_t i = 0; i < blockSize; ++i) { + for (size_t i = 0; i < blockTuples.numRows(); ++i) { std::cout << "Row #" << i << ": " - << getNameForId(mostSignificantIdsInBlock[i]); + << (i >= offsetBegin && i < offsetEnd ? getNameForId(id) + : std::string{"*"}); for (size_t j = 0; j < blockTuples.numColumns(); ++j) { std::cout << " " << getNameForId(blockTuples(i, j)); } - if (i == rowIndexInBlock) { - std::cout << " <--"; + if (i == tripleWithPosition.rowIndexInBlock) { + std::cout << " <-- " + << (tripleWithPosition.existsInIndex ? "existing triple" + : "new triple"); } std::cout << std::endl; } } - return FindTripleResult{blockIndex, rowIndexInBlock, id1, id2, id3}; + // Return the result. + return tripleWithPosition; } // ____________________________________________________________________________ @@ -217,5 +291,8 @@ std::string DeltaTriples::getNameForId(Id id) const { ExportQueryExecutionTrees::idToStringAndType(index_, id, localVocab_); AD_CONTRACT_CHECK(lookupResult.has_value()); const auto& [value, type] = lookupResult.value(); + // std::ostringstream os; + // os << "[" << id << "]"; return type ? 
absl::StrCat("\"", value, "\"^^<", type, ">") : value; + // : absl::StrCat(value, " ", os.str()); }; diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h index 1036d10c57..4fa85f15d5 100644 --- a/src/index/DeltaTriples.h +++ b/src/index/DeltaTriples.h @@ -74,25 +74,64 @@ class DeltaTriples { return H::combine(std::move(h), triple[0], triple[1], triple[2]); } - // Result record returned by `findTripleInPermutation`, containing: the - // reference to the permutation, the index of the matching block, the index of - // a position within that block, and the search triple in the right - // permutation (for example, for SPO, `id1` is the subject, `id2` is the - // predicate, and `id3` is the object). - // template - struct FindTripleResult { - // const Permutation& permutation; + // Result record returned by `locateTripleInPermutation`. + // + // NOTE: This is currently more information then we need. In particular, the + // `blockIndex` is already implicit in `TriplesWithPositionsPerBlock` and the + // bit `existsInOriginalIndex_` can be derived using the information stored in + // a block and our metadata. However, both are useful for testing and for a + // small nuber of delta triples (think millions), the space efficiency of this + // class is not a significant issue. + struct TripleWithPosition { + // The index of the block and the position within that block, where the + // triple "fits". size_t blockIndex; size_t rowIndexInBlock; + // The `Id`s of the triple in the order of the permutation. For example, + // for an object pertaining to the SPO permutation: `id1` is the subject, + // `id2` is the predicate, and `id3` is the object. Id id1; Id id2; Id id3; + // Whether the triple exists in the original index or is new. + bool existsInIndex; }; - // Data structures with position for a particular table (just an example, this - // is not the final data structure). 
- using Position = std::pair; - using Positions = ad_utility::HashMap>; + // Data structures with positions for a particular permutation. + class TriplesWithPositionsPerBlock { + private: + // A position contains information about the index within the block and the + // triple to be inserted (all three `Id`s in the order of the permutation, + // including the most significant `Id`). + using TriplesWithPositions = std::vector; + + public: + // Map from block index to position list. + // + // TODO: Keep the position list for each block index sorted (primary key: + // row index in block, secondary key: triple order). + // + // TODO: Should be private, but we want to iterate over it for testing. + ad_utility::HashMap positionMap_; + + public: + // Get the positions for a given block index. Returns an empty list if there + // are no positions for that block index. + // + // TODO: Check if that is the behavior we want when actually using class + // `DeltaTriples` to augment the result of an index scan. + TriplesWithPositions getTriplesWithPositionsForBlock(size_t blockIndex) { + auto it = positionMap_.find(blockIndex); + if (it != positionMap_.end()) { + return it->second; + } else { + return std::vector{}; + } + } + + // Empty the data structure. + void clear() { positionMap_.clear(); } + }; private: // The index to which these triples are added. @@ -110,16 +149,15 @@ class DeltaTriples { // The local vocabulary of these triples. LocalVocab localVocab_; - public: // The positions of the delta triples in each of the six permutations. // // TODO: Do the positions need to know to which permutation they belong? 
- std::vector posFindTripleResults_; - std::vector psoFindTripleResults_; - std::vector sopFindTripleResults_; - std::vector spoFindTripleResults_; - std::vector opsFindTripleResults_; - std::vector ospFindTripleResults_; + TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInPSO_; + TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInPOS_; + TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInSPO_; + TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInSOP_; + TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInOSP_; + TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInOPS_; public: // Construct for given index. @@ -139,6 +177,10 @@ class DeltaTriples { // Delete triple. void deleteTriple(TurtleTriple turtleTriple); + // Get positions for given permutation. + const TriplesWithPositionsPerBlock& getTriplesWithPositionsPerBlock( + Index::Permutation permutation) const; + // TODO: made public as long as we are trying to figure out how this works. private: public: @@ -153,13 +195,14 @@ class DeltaTriples { // Find the position of the given triple in the given permutation (a pair of // block index and index within that block; see the documentation of the class // above for how exactly that position is defined in all cases). - void findTripleInAllPermutations(const IdTriple& idTriple, - bool visualize = false); + void locateTripleInAllPermutations(const IdTriple& idTriple, + bool visualize = false); // The implementation of the above function. template - FindTripleResult findTripleInPermutation( - Id id1, Id id2, Id id3, Permutation& permutation, bool visualize) const; + TripleWithPosition locateTripleInPermutation(Id id1, Id id2, Id id3, + Permutation& permutation, + bool visualize) const; // Resolve ID to name (useful for debugging and testing). 
std::string getNameForId(Id id) const; diff --git a/src/index/MetaDataHandler.h b/src/index/MetaDataHandler.h index fe9dc070ab..16162c8d5e 100644 --- a/src/index/MetaDataHandler.h +++ b/src/index/MetaDataHandler.h @@ -33,6 +33,7 @@ class MetaDataWrapperDense { using BaseIterator::BaseIterator; AddGetIdIterator(BaseIterator base) : BaseIterator{base} {} [[nodiscard]] Id getId() const { return getIdFromElement(*(*this)); } + [[nodiscard]] const auto& getMetaData() const { return *(*this); } static Id getIdFromElement(const typename BaseIterator::value_type& v) { return v._col0Id; } @@ -168,6 +169,7 @@ class MetaDataWrapperHashMap { using BaseIterator::BaseIterator; AddGetIdIterator(BaseIterator base) : BaseIterator{base} {} [[nodiscard]] Id getId() const { return (*this)->second._col0Id; } + [[nodiscard]] const auto& getMetaData() const { return (*this)->second; } static Id getIdFromElement(const typename BaseIterator::value_type& v) { return v.second._col0Id; } @@ -289,7 +291,12 @@ class MetaDataWrapperHashMap { // does not exist in the index, it will get an ID from the local vocab, which // is larger than all existing IDs. // - // TODO: This is hacky and should be made less hacky. + // TODO: This is not quite correct. We might insert a triple where the + // predicate has an ID that exists in the original index, but it just hasn't + // been used for a predicate in the original index. Then `end()` is not the + // right answer when searching for that triple, but we indeed need the sorted + // sequence of IDs (solution: just use an ordered `std::map` instead of an + // unordered `HashMap`). 
// // (Note that this is different for OPS and OSP, because objects can also be // values and values that did not previously exist in the index get an diff --git a/test/DeltaTriplesTest.cpp b/test/DeltaTriplesTest.cpp index 16c308bd63..d050dff5ba 100644 --- a/test/DeltaTriplesTest.cpp +++ b/test/DeltaTriplesTest.cpp @@ -10,13 +10,17 @@ #include "index/IndexImpl.h" #include "parser/TurtleParser.h" +// Shortcuts to these full type names used frequently in the following. +// using DeltaTriples::IdTriple; +// using DeltaTriples::TriplesWithPositionsPerBlock; + // Fixture that sets up a test index. class DeltaTriplesTest : public ::testing::Test { protected: // The triples in our test index (as a `std::vector` so that we have easy // access to each triple separately in the tests below). static constexpr const char* testTurtle = - " 2 . " + " . " " . " " . " " . " @@ -24,10 +28,10 @@ class DeltaTriplesTest : public ::testing::Test { " . " " . " " . " - " . " - " . " " . " " . " + " . " + " . " " . " " "; @@ -59,10 +63,123 @@ class DeltaTriplesTest : public ::testing::Test { return deltaTriples.getIdTriple(makeTurtleTriple(std::move(turtle))); } - // Get the permutation with the given `enum` name. - // ... + // Get the complete sequence of "relation" (most significant) `Id`s for the + // given permutation. The result is a `std::vector` of `std::vector`, + // where the index into the outer vector is a block index, and each inner + // vector is as large as the corresponding block. + // + // NOTE: To save index storage space, these `Id`s are not stored explicitly in + // the blocks, but implicitly in the `CompressedRelationMetadata` objects of a + // permutation. For our test of `locateTripleInAllPermutations` below, we need + // random access to these `Id`s. + template + std::vector> getAllRelationIdsForPermutation( + const Permutation& permutation) { + // The metadata for each block (since our blocks are large, this is not a + // lot of data). 
+ const std::vector& metadataPerBlock = + permutation._meta.blockData(); + + // Make room for the `Id`s in our final result: one `std::vector`` per + // block, and each of these is as large as the respective block. + std::vector> result(metadataPerBlock.size()); + for (size_t i = 0; i < result.size(); ++i) { + result[i].resize(metadataPerBlock[i]._numRows, Id::makeUndefined()); + } + + // Iterate over all relations. + // + // NOTE: The metadata per "relation" is stored as a hash map for POS and PSO + // (where there are typically few distinct "relations", that is, + // predicates), and as a vector for the other four permutations (there are + // typically many distinct subjects and objects). Whatever the type, we can + // always iterate over the complete set, see `MetaDataHandler.h`. + const auto& metadataPerRelation = permutation._meta._data; + for (auto it = metadataPerRelation.begin(); it != metadataPerRelation.end(); + ++it) { + // Get the `Id` of this relation, and where it starts in its (at this + // unknown) block, and how many triples it has overall. + const CompressedRelationMetadata& relationMetadata = it.getMetaData(); + Id relationId = relationMetadata._col0Id; + size_t offsetInBlock = relationMetadata._offsetInBlock; + size_t numTriples = relationMetadata._numRows; + + // Find the index of the first block that contains triples from this + // relation. + const auto block = std::lower_bound( + metadataPerBlock.begin(), metadataPerBlock.end(), relationId, + [&](const CompressedBlockMetadata& block, const Id& id) -> bool { + return block._col0LastId < id; + }); + size_t blockIndex = block - metadataPerBlock.begin(); + AD_CORRECTNESS_CHECK(blockIndex < metadataPerBlock.size()); + AD_CORRECTNESS_CHECK(block->_col0FirstId <= relationId); + AD_CORRECTNESS_CHECK(block->_col0LastId >= relationId); + + // If the relation fits into a single block, we need to write the relation + // `Id` only in one block of our result. 
Otherwise, we have a sequence of + // blocks for only that relation `Id`. + if (offsetInBlock != std::numeric_limits::max()) { + AD_CORRECTNESS_CHECK(offsetInBlock + numTriples <= block->_numRows); + for (size_t i = offsetInBlock; i < offsetInBlock + numTriples; ++i) { + result[blockIndex][i] = relationId; + } + } else { + size_t count = 0; + while (blockIndex < metadataPerBlock.size() && + metadataPerBlock[blockIndex]._col0FirstId == relationId) { + const auto& block = metadataPerBlock[blockIndex]; + AD_CORRECTNESS_CHECK(block._col0LastId == relationId); + for (size_t i = 0; i < block._numRows; ++i) { + result[blockIndex][i] = relationId; + } + ++blockIndex; + count += block._numRows; + } + AD_CORRECTNESS_CHECK(count == numTriples); + } + } + + // Check that all slots in `result` have been written and then return it. + for (const auto& resultBlock : result) { + for (const Id& id : resultBlock) { + AD_CORRECTNESS_CHECK(id != Id::makeUndefined()); + } + } + return result; + } }; +// Print relation `Id`s for selected permutation (for debugging only). +TEST_F(DeltaTriplesTest, showAllRelationIdsForPermutation) { + bool runThisTest = false; + if (runThisTest) { + // Compute relation `Id`s for POS (choose another premutation if you wish). + const Index& index = testQec->getIndex(); + DeltaTriples deltaTriples(index); + const auto& permutation = index.getImpl().POS(); + const std::vector> allRelationIdsForPermutation = + getAllRelationIdsForPermutation(permutation); + + // Show them per block. + std::cout << endl; + std::cout << "All relation IDs for permutation " + << permutation._readableName << ":" << std::endl; + size_t blockCount = 0; + for (const auto& block : allRelationIdsForPermutation) { + std::cout << "Block #" << (++blockCount) << ":"; + for (const Id& id : block) { + std::cout << " " + << (id != Id::makeUndefined() ? 
deltaTriples.getNameForId(id) + : "UNDEF") + << std::flush; + } + std::cout << std::endl; + } + std::cout << std::endl; + } +} + // Test the constructor. TEST_F(DeltaTriplesTest, constructor) { DeltaTriples deltaTriples(testQec->getIndex()); @@ -90,72 +207,200 @@ TEST_F(DeltaTriplesTest, clear) { ASSERT_EQ(deltaTriples.numInserted(), 0); } -// Check that `findTripleInAllPermutations` locates triples correctly in all -// cases (triples that exist in the index, as well as those that do not). +// Check that `locateTripleInAllPermutations` locates triples correctly in +// all cases (triples that exist in the index, as well as those that do +// not). TEST_F(DeltaTriplesTest, findTripleInAllPermutations) { const Index& index = testQec->getIndex(); DeltaTriples deltaTriples(index); - // Check the given `FindTripleResult` for the given permutation. - auto checkFindTripleResult = - [&](const DeltaTriples::FindTripleResult& findTripleResult, - const auto& permutation) { + // Check the given `tripleWithPosition` (a block index, an index in the + // block, and a triple) is correct for the given permutation as follows: + // + // 1. If `tripleWithPosition.existsInIndex == true`, check that the + // triple indeed occurs at that position in the respective triple. + // + // 2. If `tripleWithPosition.existsInIndex == false`, check that the + // triple at the position is larger and the triple at the previous + // position is smaller. + auto checkTripleWithPositionInPermutation = + [&](const DeltaTriples::TripleWithPosition& tripleWithPosition, + const auto& permutation, + const std::vector>& relationIdsPerBlock) { + // Shortcuts for the tiples ids and its position. 
+ const size_t blockIndex = tripleWithPosition.blockIndex; + const size_t rowIndexInBlock = tripleWithPosition.rowIndexInBlock; + const bool existsInIndex = tripleWithPosition.existsInIndex; + const DeltaTriples::IdTriple deltaTriple{tripleWithPosition.id1, + tripleWithPosition.id2, + tripleWithPosition.id3}; + + // Members for accessing the data of a permutation. auto& file = permutation._file; const auto& meta = permutation._meta; const auto& reader = permutation._reader; - const size_t& blockIndex = findTripleResult.blockIndex; - const size_t& rowIndexInBlock = findTripleResult.rowIndexInBlock; - - auto& pname = permutation._readableName; - std::string name1 = deltaTriples.getNameForId(findTripleResult.id1); - std::string name2 = deltaTriples.getNameForId(findTripleResult.id2); - std::string name3 = deltaTriples.getNameForId(findTripleResult.id3); - std::string tname = absl::StrCat(std::string{pname[0]}, "=", name1, " ", - std::string{pname[1]}, "=", name2, " ", - std::string{pname[2]}, "=", name3); + // Prepare a message for when one of our assertions fails. In + // particular, provide the name of the permutation and the triple in + // nice human-readable form. + auto& namePermutation = permutation._readableName; + std::string nameId1 = deltaTriples.getNameForId(deltaTriple[0]); + std::string nameId2 = deltaTriples.getNameForId(deltaTriple[1]); + std::string nameId3 = deltaTriples.getNameForId(deltaTriple[2]); + std::string nameTriple = + absl::StrCat(std::string{namePermutation[0]}, "=", nameId1, " ", + std::string{namePermutation[1]}, "=", nameId2, " ", + std::string{namePermutation[2]}, "=", nameId3); std::string msg = - absl::StrCat("Permutation ", pname, ", triple ", tname, "\n"); + absl::StrCat("Permutation ", namePermutation, ", triple ", + nameTriple, ", block index ", blockIndex, + ", row index in block ", rowIndexInBlock, "\n"); + + // If the `blockIndex` is beyond the last index, check the following: + // + // 1. 
The delta triple does not exist in the index + // 2. The delta triple is larger than all triples in the index + // 3. Exit this test (there is nothing more to test in that case) + const vector& metadataPerBlock = + meta.blockData(); + AD_CONTRACT_CHECK(metadataPerBlock.size() > 0); + DeltaTriples::IdTriple lastTriple{metadataPerBlock.back()._col0LastId, + metadataPerBlock.back()._col1LastId, + metadataPerBlock.back()._col2LastId}; + if (blockIndex >= metadataPerBlock.size()) { + ASSERT_EQ(blockIndex, metadataPerBlock.size()) << msg; + ASSERT_FALSE(existsInIndex); + ASSERT_GT(deltaTriple, lastTriple); + return; + } - const vector& blocks = meta.blockData(); - ASSERT_LT(blockIndex, blocks.size()) << msg; - const auto& block = blocks.at(blockIndex); + // Read the triple at the block position and at the previous position + // (which might be in the previous block). + // + // TODO: We assume here that `Id::makeUndefined()` is strictly smaller + // than any regular `Id`. Is that correct? + // + // NOTE: When `blockIndex` is valid (we have handled the other case + // already above), `rowIndexInBlock` should always be a valid index into + // the block (and never one too large); check the semantics of + // `locateTripleInAllPermutations`. 
+ const auto& blockMetadata = metadataPerBlock.at(blockIndex); const auto& blockTuples = - reader.readAndDecompressBlock(block, file, std::nullopt); + reader.readAndDecompressBlock(blockMetadata, file, std::nullopt); ASSERT_LT(rowIndexInBlock, blockTuples.size()) << msg; - ASSERT_EQ(blockTuples(rowIndexInBlock, 0), findTripleResult.id2) << msg; - ASSERT_EQ(blockTuples(rowIndexInBlock, 1), findTripleResult.id3) << msg; + DeltaTriples::IdTriple blockTriple{ + relationIdsPerBlock[blockIndex][rowIndexInBlock], + blockTuples(rowIndexInBlock, 0), blockTuples(rowIndexInBlock, 1)}; + auto blockTriplePrevious = [&]() -> DeltaTriples::IdTriple { + if (rowIndexInBlock > 0) { + return DeltaTriples::IdTriple{ + relationIdsPerBlock[blockIndex][rowIndexInBlock - 1], + blockTuples(rowIndexInBlock - 1, 0), + blockTuples(rowIndexInBlock - 1, 1)}; + } else if (blockIndex > 0) { + return DeltaTriples::IdTriple{ + metadataPerBlock[blockIndex - 1]._col0LastId, + metadataPerBlock[blockIndex - 1]._col1LastId, + metadataPerBlock[blockIndex - 1]._col2LastId}; + } else { + return DeltaTriples::IdTriple{ + Id::makeUndefined(), Id::makeUndefined(), Id::makeUndefined()}; + } + }(); + + // Now we can check whether our delta triple is exactly at the right + // location. + if (existsInIndex) { + ASSERT_EQ(blockTriple, deltaTriple) << msg; + ASSERT_LT(blockTriplePrevious, deltaTriple) << msg; + } else { + ASSERT_GT(blockTriple, deltaTriple) << msg; + ASSERT_LT(blockTriplePrevious, deltaTriple) << msg; + } + }; + + // Check that all `tripleWithPosition` in `positionsPerBlock` are + // correct for the given permutation. 
+ auto checkAllTriplesWithPositionsForPermutation = + [&](const DeltaTriples::TriplesWithPositionsPerBlock& + triplesWithPositionsPerBlock, + const auto& permutation) { + std::vector> allRelationIdsForPermutation = + getAllRelationIdsForPermutation(permutation); + for (const auto& [blockIndex, triplesWithPositions] : + triplesWithPositionsPerBlock.positionMap_) { + for (const auto& tripleWithPosition : triplesWithPositions) { + checkTripleWithPositionInPermutation( + tripleWithPosition, permutation, allRelationIdsForPermutation); + } + } }; - // Check if each existing triple is located correctly in every permutation. - size_t numTriples = 0; + // Check that all `tripleWithPosition`s are correct (for all + // permutations). the given permutation. + auto checkAllTriplesWithPositionForAllPermutations = [&](const DeltaTriples& + deltaTriples) { + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Index::Permutation::POS), + index.getImpl().POS()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Index::Permutation::PSO), + index.getImpl().PSO()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Index::Permutation::SPO), + index.getImpl().SPO()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Index::Permutation::SOP), + index.getImpl().SOP()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Index::Permutation::OPS), + index.getImpl().OPS()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Index::Permutation::OSP), + index.getImpl().OSP()); + }; + + // Check if each existing triple is located correctly in every + // permutation. + // + // TODO: Check that `existsInIndex` was set correctly. 
Test test routine + // above just take it from the tested `TripleWithPosition` objects + // (which might be wrong) + // + // TODO: Check that each triple that was located was indeed added to + // each of the `TriplesWithPositionsPerBlock` objects. + // + // TODO: Eventually, we should test `insertTriple` and `deleteTriple`, + // which only insert a triple when it doesn't exist in the original + // index, and which only delete a triple, when it does exist in the + // original index. But let's first get `locateTripleInAllPermutations` + // correct. Note that to check whether a triple exists or not in the + // original index, looking at one permutation suffices. + [[maybe_unused]] size_t numTriples = 0; for (std::string_view triple : testTriples()) { DeltaTriples::IdTriple idTriple = makeIdTriple(deltaTriples, std::string{triple}); - deltaTriples.findTripleInAllPermutations(idTriple); - ++numTriples; - ASSERT_EQ(deltaTriples.posFindTripleResults_.size(), numTriples); - checkFindTripleResult(deltaTriples.posFindTripleResults_.back(), - index.getImpl().POS()); - checkFindTripleResult(deltaTriples.psoFindTripleResults_.back(), - index.getImpl().PSO()); - checkFindTripleResult(deltaTriples.spoFindTripleResults_.back(), - index.getImpl().SPO()); - checkFindTripleResult(deltaTriples.sopFindTripleResults_.back(), - index.getImpl().SOP()); - checkFindTripleResult(deltaTriples.opsFindTripleResults_.back(), - index.getImpl().OPS()); - checkFindTripleResult(deltaTriples.ospFindTripleResults_.back(), - index.getImpl().OSP()); + deltaTriples.locateTripleInAllPermutations(idTriple); } + checkAllTriplesWithPositionForAllPermutations(deltaTriples); + + // Check that new triples are located correctly in every permutation. 
+ for (std::string_view triple : testTriples()) { + std::string newTriple{triple}; + newTriple[1] = 'X'; + DeltaTriples::IdTriple idTriple = makeIdTriple(deltaTriples, newTriple); + deltaTriples.locateTripleInAllPermutations(idTriple); + } + checkAllTriplesWithPositionForAllPermutations(deltaTriples); } -// Visualize the result of `findTripleInPermutation` for one particular triple -// by showing the whole block (for understanding and debugging only, this will -// eventually be deleted). +// Visualize the result of `findTripleInPermutation` for one particular +// triple by showing the whole block (for understanding and debugging +// only, this will eventually be deleted). TEST_F(DeltaTriplesTest, findTripleInAllPermutationsVisualize) { DeltaTriples deltaTriples(testQec->getIndex()); - std::string tripleAsString = " "; + std::string tripleAsString = " "; + // std::string tripleAsString = " "; // std::string tripleAsString = " "; // std::string tripleAsString = " "; // std::string tripleAsString = " <0> "; @@ -163,12 +408,12 @@ TEST_F(DeltaTriplesTest, findTripleInAllPermutationsVisualize) { std::cout << std::endl; std::cout << "Searching the following triple: " << tripleAsString << std::endl; - std::cout - << "For each permutation, find the first element that is not smaller" - << std::endl; + std::cout << "For each permutation, find the first element that is not " + "smaller" + << std::endl; // Search the triple in all permutations. DeltaTriples::IdTriple idTriple = makeIdTriple(deltaTriples, tripleAsString); - deltaTriples.findTripleInAllPermutations(idTriple, true); + deltaTriples.locateTripleInAllPermutations(idTriple, true); std::cout << std::endl; } diff --git a/test/IndexTestHelpers.h b/test/IndexTestHelpers.h index 72a86d64ee..5ae4638861 100644 --- a/test/IndexTestHelpers.h +++ b/test/IndexTestHelpers.h @@ -34,7 +34,8 @@ inline Index makeIndexWithTestSettings() { // when the files were not deleted after the test). 
inline std::vector getAllIndexFilenames( const std::string indexBasename) { - return {indexBasename + ".index.pos", + return {indexBasename + ".ttl", + indexBasename + ".index.pos", indexBasename + ".index.pso", indexBasename + ".index.sop", indexBasename + ".index.sop.meta", @@ -64,7 +65,7 @@ inline Index makeTestIndex(const std::string& indexBasename, // these tests. static std::ostringstream ignoreLogStream; ad_utility::setGlobalLoggingStream(&ignoreLogStream); - std::string filename = "relationalExpressionTestIndex.ttl"; + std::string inputFilename = indexBasename + ".ttl"; if (turtleInput.empty()) { turtleInput = " ")); + deltaTriples.insertTriple(makeTurtleTriple(" ")); ASSERT_EQ(deltaTriples.numInserted(), 1); ASSERT_EQ(deltaTriples.numDeleted(), 0); + checkTriplesWithPositionsPerBlockSize(deltaTriples, 1); deltaTriples.clear(); ASSERT_EQ(deltaTriples.numInserted(), 0); ASSERT_EQ(deltaTriples.numDeleted(), 0); + checkTriplesWithPositionsPerBlockSize(deltaTriples, 0); // Delete then clear. - deltaTriples.deleteTriple(makeTurtleTriple(" ")); + deltaTriples.deleteTriple(makeTurtleTriple(" ")); ASSERT_EQ(deltaTriples.numInserted(), 0); ASSERT_EQ(deltaTriples.numDeleted(), 1); + checkTriplesWithPositionsPerBlockSize(deltaTriples, 1); deltaTriples.clear(); ASSERT_EQ(deltaTriples.numInserted(), 0); ASSERT_EQ(deltaTriples.numInserted(), 0); + checkTriplesWithPositionsPerBlockSize(deltaTriples, 0); } -// Check that `locateTripleInAllPermutations` locates triples correctly in -// all cases (triples that exist in the index, as well as those that do -// not). -TEST_F(DeltaTriplesTest, findTripleInAllPermutations) { +// Check that insert and delete work as they should. The core of this test is to +// check that `locateTripleInPermutation` and `locateTripleInAllPermutations` +// work correctly. +// +// TODO: Wouldn't it make more sense to test the mentioned functions instead of +// `insertTriple` and `deleteTriple`? 
+TEST_F(DeltaTriplesTest, insertAndDeleteTriples) { const Index& index = testQec->getIndex(); DeltaTriples deltaTriples(index); @@ -240,17 +281,10 @@ TEST_F(DeltaTriplesTest, findTripleInAllPermutations) { const auto& meta = permutation._meta; const auto& reader = permutation._reader; - // Prepare a message for when one of our assertions fails. In - // particular, provide the name of the permutation and the triple in - // nice human-readable form. - auto& namePermutation = permutation._readableName; - std::string nameId1 = deltaTriples.getNameForId(deltaTriple[0]); - std::string nameId2 = deltaTriples.getNameForId(deltaTriple[1]); - std::string nameId3 = deltaTriples.getNameForId(deltaTriple[2]); - std::string nameTriple = - absl::StrCat(std::string{namePermutation[0]}, "=", nameId1, " ", - std::string{namePermutation[1]}, "=", nameId2, " ", - std::string{namePermutation[2]}, "=", nameId3); + // Prepare a message for when one of our assertions fails, with nice + // names for the permutation and the `deltaTriple`. + auto [namePermutation, nameTriple] = getNicePermutationAndTripleName( + deltaTriples, permutation, deltaTriple); std::string msg = absl::StrCat("Permutation ", namePermutation, ", triple ", nameTriple, ", block index ", blockIndex, @@ -376,31 +410,54 @@ TEST_F(DeltaTriplesTest, findTripleInAllPermutations) { // original index. But let's first get `locateTripleInAllPermutations` // correct. Note that to check whether a triple exists or not in the // original index, looking at one permutation suffices. 
- [[maybe_unused]] size_t numTriples = 0; - for (std::string_view triple : testTriples()) { - DeltaTriples::IdTriple idTriple = - makeIdTriple(deltaTriples, std::string{triple}); - deltaTriples.locateTripleInAllPermutations(idTriple); + const std::vector& testTriples = getTestTriples(); + for (std::string_view triple : testTriples) { + deltaTriples.deleteTriple(makeTurtleTriple(triple)); + } + checkTriplesWithPositionsPerBlockSize(deltaTriples, testTriples.size()); + checkAllTriplesWithPositionForAllPermutations(deltaTriples); + + // Inserting the triples a second time should throw an exception (and not + // change anything about the internal data structures). + for (std::string_view triple : testTriples) { + AD_EXPECT_THROW_WITH_MESSAGE( + deltaTriples.deleteTriple(makeTurtleTriple(triple)), + ::testing::HasSubstr("this deletion therefore has no effect")); } + checkTriplesWithPositionsPerBlockSize(deltaTriples, testTriples.size()); checkAllTriplesWithPositionForAllPermutations(deltaTriples); // Check that new triples are located correctly in every permutation. - for (std::string_view triple : testTriples()) { + for (std::string_view triple : testTriples) { + std::string newTriple{triple}; + newTriple[1] = 'X'; + deltaTriples.insertTriple(makeTurtleTriple(newTriple)); + } + checkTriplesWithPositionsPerBlockSize(deltaTriples, 2 * testTriples.size()); + checkAllTriplesWithPositionForAllPermutations(deltaTriples); + + // Deleting the triples a second time should throw an exception (and not + // change anything about the internal data structures). 
+ for (std::string_view triple : testTriples) { std::string newTriple{triple}; newTriple[1] = 'X'; - DeltaTriples::IdTriple idTriple = makeIdTriple(deltaTriples, newTriple); - deltaTriples.locateTripleInAllPermutations(idTriple); + AD_EXPECT_THROW_WITH_MESSAGE( + deltaTriples.insertTriple(makeTurtleTriple(newTriple)), + ::testing::HasSubstr("this insertion therefore has no effect")); } + checkTriplesWithPositionsPerBlockSize(deltaTriples, 2 * testTriples.size()); checkAllTriplesWithPositionForAllPermutations(deltaTriples); + } // Visualize the result of `findTripleInPermutation` for one particular // triple by showing the whole block (for understanding and debugging // only, this will eventually be deleted). TEST_F(DeltaTriplesTest, findTripleInAllPermutationsVisualize) { - DeltaTriples deltaTriples(testQec->getIndex()); - std::string tripleAsString = " "; - // std::string tripleAsString = " "; + const Index& index = testQec->getIndex(); + DeltaTriples deltaTriples(index); + // std::string tripleAsString = " "; + std::string tripleAsString = " "; // std::string tripleAsString = " "; // std::string tripleAsString = " "; // std::string tripleAsString = " <0> "; @@ -414,6 +471,69 @@ TEST_F(DeltaTriplesTest, findTripleInAllPermutationsVisualize) { // Search the triple in all permutations. DeltaTriples::IdTriple idTriple = makeIdTriple(deltaTriples, tripleAsString); - deltaTriples.locateTripleInAllPermutations(idTriple, true); + auto iterators = deltaTriples.locateTripleInAllPermutations(idTriple); + + // Helper lambda for showing the block from the given permutation that + // contains the given (via an iterator) `TripleWithPosition` object. + auto showBlock = + [&](DeltaTriples::TriplesWithPositions::iterator& tripleWithPosition, + const auto& permutation) { + // Shortcuts for the triple and its position. 
+ // AD_CORRECTNESS_CHECK(tripleWithPosition != tripleWithPosition.end()); + const size_t blockIndex = tripleWithPosition->blockIndex; + const size_t rowIndexInBlock = tripleWithPosition->rowIndexInBlock; + const bool existsInIndex = tripleWithPosition->existsInIndex; + const DeltaTriples::IdTriple deltaTriple{tripleWithPosition->id1, + tripleWithPosition->id2, + tripleWithPosition->id3}; + + // Get nice names for the permutation and the triple. + auto [namePermutation, nameTriple] = getNicePermutationAndTripleName( + deltaTriples, permutation, deltaTriple); + + // If we are beyond the last block, there is nothing to show. + const vector& blockMetas = + permutation._meta.blockData(); + if (blockIndex >= blockMetas.size()) { + std::cout << endl; + std::cout << "All triples in " << namePermutation + << " are smaller than " << nameTriple << std::endl; + return; + } + + // Read the block and compute all relation `Id`s. + const CompressedBlockMetadata& blockMetadata = blockMetas[blockIndex]; + DecompressedBlock blockTuples = + permutation._reader.readAndDecompressBlock( + blockMetadata, permutation._file, std::nullopt); + std::vector blockRelationIds = + getAllRelationIdsForPermutation(permutation).at(blockIndex); + AD_CORRECTNESS_CHECK(blockRelationIds.size() == blockTuples.size()); + + // Show the triples in the block. + std::cout << std::endl; + std::cout << "Block #" << blockIndex << " from " << namePermutation + << " (" << nameTriple << "):" << std::endl; + for (size_t i = 0; i < blockTuples.numRows(); ++i) { + std::cout << "Row #" << i << ": " + << deltaTriples.getNameForId(blockRelationIds[i]); + for (size_t j = 0; j < blockTuples.numColumns(); ++j) { + std::cout << " " << deltaTriples.getNameForId(blockTuples(i, j)); + } + if (i == rowIndexInBlock) { + std::cout << " <-- " + << (existsInIndex ? "existing triple" : "new triple"); + } + std::cout << std::endl; + } + }; + + // Show block for each permutation. 
+ showBlock(iterators.iteratorPOS, index.getImpl().POS()); + showBlock(iterators.iteratorPSO, index.getImpl().PSO()); + showBlock(iterators.iteratorSPO, index.getImpl().SPO()); + showBlock(iterators.iteratorSOP, index.getImpl().SOP()); + showBlock(iterators.iteratorOSP, index.getImpl().OSP()); + showBlock(iterators.iteratorOPS, index.getImpl().OPS()); std::cout << std::endl; } From d2d99e09ae3b404e55ae930c1cea847ca27a0713 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sun, 19 Mar 2023 16:23:55 +0100 Subject: [PATCH 05/20] Make clang-format happy --- src/index/DeltaTriples.cpp | 14 ++++++-------- test/DeltaTriplesTest.cpp | 1 - 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 42f79582b5..2d65177872 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -50,10 +50,9 @@ void DeltaTriples::insertTriple(TurtleTriple turtleTriple) { // // TODO: Test this behavior. if (triplesInserted_.contains(idTriple)) { - throw std::runtime_error( - absl::StrCat("Triple \"", turtleTriple.toString(), - "\" was already inserted before", - ", this insertion therefore has no effect")); + throw std::runtime_error(absl::StrCat( + "Triple \"", turtleTriple.toString(), "\" was already inserted before", + ", this insertion therefore has no effect")); } // When re-inserting a previously deleted triple, we need to remove the triple // from `triplesDeleted_` AND remove it from all @@ -86,10 +85,9 @@ void DeltaTriples::deleteTriple(TurtleTriple turtleTriple) { // // TODO: Test this behavior. 
if (triplesDeleted_.contains(idTriple)) { - throw std::runtime_error( - absl::StrCat("Triple \"", turtleTriple.toString(), - "\" was already deleted before", - ", this deletion therefore has no effect")); + throw std::runtime_error(absl::StrCat( + "Triple \"", turtleTriple.toString(), "\" was already deleted before", + ", this deletion therefore has no effect")); } // When deleting a previously inserted triple (that did not exist in the index // before), we need to remove the triple from `triplesInserted_` AND remove it diff --git a/test/DeltaTriplesTest.cpp b/test/DeltaTriplesTest.cpp index e37de7f5b0..253315940d 100644 --- a/test/DeltaTriplesTest.cpp +++ b/test/DeltaTriplesTest.cpp @@ -447,7 +447,6 @@ TEST_F(DeltaTriplesTest, insertAndDeleteTriples) { } checkTriplesWithPositionsPerBlockSize(deltaTriples, 2 * testTriples.size()); checkAllTriplesWithPositionForAllPermutations(deltaTriples); - } // Visualize the result of `findTripleInPermutation` for one particular From 4017177d5d5e0f892ee877cf71cf954824231781 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sun, 19 Mar 2023 16:46:34 +0100 Subject: [PATCH 06/20] Try to make the more pedantic native build happy It complained about an assignment in the condition of an `if`. 
--- src/engine/Server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 92d547ca50..a35660f4d8 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -341,10 +341,10 @@ Awaitable Server::process( bool insertDetected = false; bool deleteDetected = false; std::optional parameterValue; - if (parameterValue = checkParameter("insert", std::nullopt)) { + if ((parameterValue = checkParameter("insert", std::nullopt))) { LOG(INFO) << "INSERT: " << parameterValue.value() << std::endl; insertDetected = true; - } else if (parameterValue = checkParameter("delete", std::nullopt)) { + } else if ((parameterValue = checkParameter("delete", std::nullopt))) { LOG(INFO) << "DELETE: " << parameterValue.value() << std::endl; deleteDetected = true; } From b4a81dfaefa304fd235778b171401ee2aa0357c7 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Tue, 21 Mar 2023 13:25:17 +0100 Subject: [PATCH 07/20] The permutations now have access to the delta triples 1. Each permutation now has information about the delta triples per block and where exactly they are located in that permutation 2. As a proof of concept, the LOG already shows the number of delta triples found in the blocks relevant for the scan. 
--- src/engine/Server.cpp | 13 ++-- src/engine/Server.h | 1 - src/index/CompressedRelation.cpp | 32 ++++++++- src/index/CompressedRelation.h | 11 ++- src/index/ConstantsIndexBuilding.h | 5 +- src/index/DeltaTriples.h | 11 ++- src/index/Index.cpp | 18 ++++- src/index/Index.h | 24 ++++++- src/index/IndexImpl.cpp | 3 +- src/index/IndexImpl.h | 106 ++++++++++++++++++++--------- src/index/Permutations.h | 28 +++++--- 11 files changed, 188 insertions(+), 64 deletions(-) diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index a35660f4d8..49f9761a5b 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -35,7 +35,6 @@ Server::Server(const int port, const int numThreads, size_t maxMemGB, }}, _sortPerformanceEstimator(), _index(), - _deltaTriples(_index), _engine(), _initialized(false), // The number of server threads currently also is the number of queries @@ -320,8 +319,8 @@ Awaitable Server::process( response = createJsonResponse(composeCacheStatsJson(), request); } else if (auto cmd = checkParameter("cmd", "clear-delta-triples")) { logCommand(cmd, "clear delta triples"); - _deltaTriples.clear(); - response = createJsonResponse(composeCacheStatsJson(), request); + _index.deltaTriples().clear(); + response = createJsonResponse(composeStatsJson(), request); } else if (auto cmd = checkParameter("cmd", "get-settings")) { logCommand(cmd, "get server settings"); response = createJsonResponse(RuntimeParameters().toMap(), request); @@ -360,13 +359,13 @@ Awaitable Server::process( } TurtleTriple turtleTriple = parser.getTriples()[0]; if (insertDetected) { - _deltaTriples.insertTriple(std::move(turtleTriple)); + _index.deltaTriples().insertTriple(std::move(turtleTriple)); response = createOkResponse(absl::StrCat("INSERT operation for triple \"", input, "\" processed\n"), request, ad_utility::MediaType::textPlain); } else { - _deltaTriples.deleteTriple(std::move(turtleTriple)); + _index.deltaTriples().deleteTriple(std::move(turtleTriple)); response = 
createOkResponse(absl::StrCat("DELETE operation for triple \"", input, "\" processed\n"), @@ -515,8 +514,8 @@ json Server::composeStatsJson() const { result["num-text-records"] = _index.getNofTextRecords(); result["num-word-occurrences"] = _index.getNofWordPostings(); result["num-entity-occurrences"] = _index.getNofEntityPostings(); - result["num-delta-triples-inserted"] = _deltaTriples.numInserted(); - result["num-delta-triples-deleted"] = _deltaTriples.numDeleted(); + result["num-delta-triples-inserted"] = _index.deltaTriples().numInserted(); + result["num-delta-triples-deleted"] = _index.deltaTriples().numDeleted(); return result; } diff --git a/src/engine/Server.h b/src/engine/Server.h index 4ef4aaccf1..db72ea37e0 100644 --- a/src/engine/Server.h +++ b/src/engine/Server.h @@ -59,7 +59,6 @@ class Server { ad_utility::AllocatorWithLimit _allocator; SortPerformanceEstimator _sortPerformanceEstimator; Index _index; - DeltaTriples _deltaTriples; Engine _engine; bool _initialized; diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp index a03720af10..d75bad8afb 100644 --- a/src/index/CompressedRelation.cpp +++ b/src/index/CompressedRelation.cpp @@ -18,7 +18,9 @@ void CompressedRelationReader::scan( const CompressedRelationMetadata& metadata, const vector& blockMetadata, ad_utility::File& file, IdTable* result, - ad_utility::SharedConcurrentTimeoutTimer timer) const { + ad_utility::SharedConcurrentTimeoutTimer timer, + const DeltaTriples::TriplesWithPositionsPerBlock& + triplesWithPositionsPerBlock) const { AD_CONTRACT_CHECK(result->numColumns() == NumColumns); // get all the blocks where _col0FirstId <= col0Id <= _col0LastId @@ -42,6 +44,18 @@ void CompressedRelationReader::scan( return a._col0FirstId < b._col0FirstId && a._col0LastId < b._col0LastId; }); + // PRELIMINARY: Say how many delta triples are contained in those blocks. 
+ size_t numDeltaTriples = 0; + for (auto block = beginBlock; block < endBlock; ++block) { + size_t blockIndex = block - blockMetadata.begin(); + if (triplesWithPositionsPerBlock.positionMap_.contains(blockIndex)) { + numDeltaTriples += + triplesWithPositionsPerBlock.positionMap_.at(blockIndex).size(); + } + } + LOG(INFO) << "Number of delta triples in blocks scanned: " << numDeltaTriples + << std::endl; + // The total size of the result is now known. result->resize(metadata.getNofElements()); @@ -156,7 +170,9 @@ void CompressedRelationReader::scan( void CompressedRelationReader::scan( const CompressedRelationMetadata& metaData, Id col1Id, const vector& blocks, ad_utility::File& file, - IdTable* result, ad_utility::SharedConcurrentTimeoutTimer timer) const { + IdTable* result, ad_utility::SharedConcurrentTimeoutTimer timer, + const DeltaTriples::TriplesWithPositionsPerBlock& + triplesWithPositionsPerBlock) const { AD_CONTRACT_CHECK(result->numColumns() == 1); // Get all the blocks that possibly might contain our pair of col0Id and @@ -184,6 +200,18 @@ void CompressedRelationReader::scan( std::equal_range(blocks.begin(), blocks.end(), KeyLhs{col0Id, col0Id, col1Id, col1Id}, comp); + // PRELIMINARY: Say how many delta triples are contained in those blocks. 
+ size_t numDeltaTriples = 0; + for (auto block = beginBlock; block < endBlock; ++block) { + size_t blockIndex = block - blocks.begin(); + if (triplesWithPositionsPerBlock.positionMap_.contains(blockIndex)) { + numDeltaTriples += + triplesWithPositionsPerBlock.positionMap_.at(blockIndex).size(); + } + } + LOG(INFO) << "Number of delta triples in blocks scanned: " << numDeltaTriples + << std::endl; + // Invariant: The col0Id is completely stored in a single block, or it is // contained in multiple blocks that only contain this col0Id, bool col0IdHasExclusiveBlocks = diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 85e656ddbd..b2291c8af4 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -11,6 +11,7 @@ #include "engine/idTable/IdTable.h" #include "global/Id.h" #include "index/ConstantsIndexBuilding.h" +#include "index/DeltaTriples.h" #include "util/BufferedVector.h" #include "util/Cache.h" #include "util/ConcurrentCache.h" @@ -262,7 +263,10 @@ class CompressedRelationReader { void scan(const CompressedRelationMetadata& metadata, const vector& blockMetadata, ad_utility::File& file, IdTable* result, - ad_utility::SharedConcurrentTimeoutTimer timer) const; + ad_utility::SharedConcurrentTimeoutTimer timer, + const DeltaTriples::TriplesWithPositionsPerBlock& + triplesWithPositionsPerBlock = + DeltaTriples::TriplesWithPositionsPerBlock{}) const; /** * @brief For a permutation XYZ, retrieve all Z for given X and Y. 
@@ -282,7 +286,10 @@ class CompressedRelationReader { void scan(const CompressedRelationMetadata& metaData, Id col1Id, const vector& blocks, ad_utility::File& file, IdTable* result, - ad_utility::SharedConcurrentTimeoutTimer timer = nullptr) const; + ad_utility::SharedConcurrentTimeoutTimer timer, + const DeltaTriples::TriplesWithPositionsPerBlock& + triplesWithPositionsPerBlock = + DeltaTriples::TriplesWithPositionsPerBlock{}) const; private: // Read the block that is identified by the `blockMetaData` from the `file`. diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h index 5a6d8dfed0..40679aa42b 100644 --- a/src/index/ConstantsIndexBuilding.h +++ b/src/index/ConstantsIndexBuilding.h @@ -84,5 +84,8 @@ constexpr size_t QUEUE_SIZE_AFTER_PARALLEL_PARSING = 10; // // NOTE: For playing around with `DeltaTriples`, I am setting this to a // deliberately small number. +constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 16; // 64 KB. +// constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 17; // 128 KB. +// constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 13; // 8 KB. // constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 3 * 16; -constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 23u; +// constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 23u; diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h index 1bb615aac5..88c53e3958 100644 --- a/src/index/DeltaTriples.h +++ b/src/index/DeltaTriples.h @@ -5,12 +5,19 @@ #pragma once #include "engine/LocalVocab.h" -#include "global/Id.h" #include "index/Index.h" #include "index/IndexBuilderTypes.h" #include "parser/TurtleParser.h" #include "util/HashSet.h" +// The `DeltaTriples` class needs to know the `Index` to which it belongs (for +// translating the components of a Turtle triple to `Id`s and for locating `Id`s +// in the permutations). 
However, the `Index` class also needs to know the +// `DeltaTriples` (in order to consider these triples when scanning a +// permutation). To avoid a circular include, and since we only need an `Index&` +// here, we can use a forward declaration. +// class Index; + // A class for maintaining triples that were inserted or deleted after index // building. // @@ -160,8 +167,6 @@ class DeltaTriples { const Index& index_; // The positions of the delta triples in each of the six permutations. - // - // TODO: Do the positions need to know to which permutation they belong? TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInPSO_; TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInPOS_; TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInSPO_; diff --git a/src/index/Index.cpp b/src/index/Index.cpp index 82c2ab5a0c..d441693590 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -8,8 +8,10 @@ #include "./IndexImpl.h" -// ____________________________________________________________ -Index::Index() : pimpl_{std::make_unique()} {} +// _____________________________________________________________________________ +Index::Index() + : deltaTriples_{std::make_unique(*this)}, + pimpl_{std::make_unique(std::move(deltaTriples_))} {} Index::Index(Index&&) noexcept = default; // Needs to be in the .cpp file because of the unique_ptr to a forwarded class. 
@@ -17,6 +19,18 @@ Index::Index(Index&&) noexcept = default; // https://stackoverflow.com/questions/13414652/forward-declaration-with-unique-ptr Index::~Index() = default; +// _____________________________________________________________________________ +IndexImpl& Index::getImpl() { return *pimpl_; } +[[nodiscard]] const IndexImpl& Index::getImpl() const { return *pimpl_; } + +// _____________________________________________________________________________ +[[nodiscard]] DeltaTriples& Index::deltaTriples() { + return pimpl_->deltaTriples(); +} +[[nodiscard]] const DeltaTriples& Index::deltaTriples() const { + return pimpl_->deltaTriples(); +} + // ___________________________________________________________ template void Index::createFromFile(const std::string& filename) { diff --git a/src/index/Index.h b/src/index/Index.h index 37cc173b49..04e0bc8e7a 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -20,6 +20,7 @@ class IdTable; class TextBlockMetaData; class IndexImpl; +class DeltaTriples; /** * Used as a template argument to the `createFromFile` method, when we do not @@ -30,6 +31,18 @@ class TurtleParserAuto {}; class Index { private: + // A unique `DeltaTriples` object will be created when this `Index` object is + // constructed and then immediately passed on to the `IndexImpl` (see the next + // member). This order is important because the `DeltaTriples` object needs to + // know the `Index` to which it pertains, and the `IndexImpl` needs access to + // the `DeltaTriples` when scanning permutations. + // + // NOTE: The `Index` and `IndexImpl` class could also share a pointer to the + // same `DeltaTriples` object, but it seems more correct to have a unique + // pointer, which only `IndexImpl` owns. Note how the `deltaTriples` getter + // below accesses the object via `pimpl_`. + std::unique_ptr deltaTriples_; + // Pimpl to reduce compile times. 
std::unique_ptr pimpl_; @@ -60,6 +73,9 @@ class Index { // Get underlying access to the Pimpl where necessary. const IndexImpl& getPimpl() const { return *pimpl_; } + // Use delta triples (the default is not to use them). + void useDeltaTriples(); + // Create an index from a file. Will write vocabulary and on-disk index data. // NOTE: The index can not directly be used after this call, but has to be // setup by `createFromOnDiskIndex` after this call. @@ -301,6 +317,10 @@ class Index { // Get access to the implementation. This should be used rarely as it // requires including the rather expensive `IndexImpl.h` header - IndexImpl& getImpl() { return *pimpl_; } - [[nodiscard]] const IndexImpl& getImpl() const { return *pimpl_; } + IndexImpl& getImpl(); + [[nodiscard]] const IndexImpl& getImpl() const; + + // Get acces to the delta triples. + [[nodiscard]] DeltaTriples& deltaTriples(); + [[nodiscard]] const DeltaTriples& deltaTriples() const; }; diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 6f3d8d55d1..feea741a1c 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -30,7 +30,8 @@ using std::array; // _____________________________________________________________________________ -IndexImpl::IndexImpl() : _usePatterns(false) {} +IndexImpl::IndexImpl(std::unique_ptr deltaTriples) + : deltaTriples_(std::move(deltaTriples)), _usePatterns(false) {} // _____________________________________________________________________________ template diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index fa79cb0cb1..da8bb007b2 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -5,33 +5,6 @@ // 2018- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de) #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - 
#include #include #include @@ -42,6 +15,34 @@ #include #include +#include "engine/ResultTable.h" +#include "global/Pattern.h" +#include "index/CompressedRelation.h" +#include "index/ConstantsIndexBuilding.h" +#include "index/DeltaTriples.h" +#include "index/DocsDB.h" +#include "index/Index.h" +#include "index/IndexBuilderTypes.h" +#include "index/IndexMetaData.h" +#include "index/PatternCreator.h" +#include "index/Permutations.h" +#include "index/StxxlSortFunctors.h" +#include "index/TextMetaData.h" +#include "index/Vocabulary.h" +#include "index/VocabularyGenerator.h" +#include "parser/ContextFileParser.h" +#include "parser/TripleComponent.h" +#include "parser/TurtleParser.h" +#include "util/BackgroundStxxlSorter.h" +#include "util/BufferedVector.h" +#include "util/CompressionUsingZstd/ZstdWrapper.h" +#include "util/File.h" +#include "util/Forward.h" +#include "util/HashMap.h" +#include "util/MmapVector.h" +#include "util/Timer.h" +#include "util/json.h" + using ad_utility::BufferedVector; using ad_utility::MmapVector; using ad_utility::MmapVectorView; @@ -132,6 +133,10 @@ class IndexImpl { off_t _currentoff_t; mutable ad_utility::File _textIndexFile; + // Reference to the delta triples from the `Index` class of which this class + // is the implementation. + std::unique_ptr deltaTriples_; + // If false, only PSO and POS permutations are loaded and expected. bool _loadAllPermutations = true; @@ -167,15 +172,46 @@ class IndexImpl { // TODO: make those private and allow only const access // instantiations for the six permutations used in QLever. // They simplify the creation of permutations in the index class. 
- Permutation::POS_T _POS{SortByPOS(), "POS", ".pos", {1, 2, 0}}; - Permutation::PSO_T _PSO{SortByPSO(), "PSO", ".pso", {1, 0, 2}}; - Permutation::SOP_T _SOP{SortBySOP(), "SOP", ".sop", {0, 2, 1}}; - Permutation::SPO_T _SPO{SortBySPO(), "SPO", ".spo", {0, 1, 2}}; - Permutation::OPS_T _OPS{SortByOPS(), "OPS", ".ops", {2, 1, 0}}; - Permutation::OSP_T _OSP{SortByOSP(), "OSP", ".osp", {2, 0, 1}}; + Permutation::POS_T _POS{ + SortByPOS(), + "POS", + ".pos", + {1, 2, 0}, + deltaTriples_->getTriplesWithPositionsPerBlock(Index::Permutation::POS)}; + Permutation::PSO_T _PSO{ + SortByPSO(), + "PSO", + ".pso", + {1, 0, 2}, + deltaTriples_->getTriplesWithPositionsPerBlock(Index::Permutation::PSO)}; + Permutation::SOP_T _SOP{ + SortBySOP(), + "SOP", + ".sop", + {0, 2, 1}, + deltaTriples_->getTriplesWithPositionsPerBlock(Index::Permutation::SOP)}; + Permutation::SPO_T _SPO{ + SortBySPO(), + "SPO", + ".spo", + {0, 1, 2}, + deltaTriples_->getTriplesWithPositionsPerBlock(Index::Permutation::SPO)}; + Permutation::OPS_T _OPS{ + SortByOPS(), + "OPS", + ".ops", + {2, 1, 0}, + deltaTriples_->getTriplesWithPositionsPerBlock(Index::Permutation::OPS)}; + Permutation::OSP_T _OSP{ + SortByOSP(), + "OSP", + ".osp", + {2, 0, 1}, + deltaTriples_->getTriplesWithPositionsPerBlock(Index::Permutation::OSP)}; public: - IndexImpl(); + IndexImpl(std::unique_ptr deltaTriples = + std::unique_ptr()); /// Forbid copy and assignment. IndexImpl& operator=(const IndexImpl&) = delete; @@ -196,6 +232,8 @@ class IndexImpl { auto& OPS() { return _OPS; } const auto& OSP() const { return _OSP; } auto& OSP() { return _OSP; } + const DeltaTriples& deltaTriples() const { return *deltaTriples_; } + DeltaTriples& deltaTriples() { return *deltaTriples_; } // Creates an index from a file. Parameter Parser must be able to split the // file's format into triples. 
diff --git a/src/index/Permutations.h b/src/index/Permutations.h index 9699bedb30..b6d7bb422e 100644 --- a/src/index/Permutations.h +++ b/src/index/Permutations.h @@ -1,16 +1,18 @@ // Copyright 2018, University of Freiburg, // Chair of Algorithms and Data Structures. // Author: Johannes Kalmbach (johannes.kalmbach@gmail.com) + #pragma once #include #include -#include "../global/Constants.h" -#include "../util/File.h" -#include "../util/Log.h" -#include "./IndexMetaData.h" -#include "./StxxlSortFunctors.h" +#include "global/Constants.h" +#include "index/DeltaTriples.h" +#include "index/IndexMetaData.h" +#include "index/StxxlSortFunctors.h" +#include "util/File.h" +#include "util/Log.h" namespace Permutation { using std::array; @@ -21,11 +23,19 @@ using std::string; // STXXL. template class PermutationImpl { + private: + // The delta triples and their positions in this permutation. + const DeltaTriples::TriplesWithPositionsPerBlock& + triplesWithPositionsPerBlock_; + public: using MetaData = MetaDataT; PermutationImpl(const Comparator& comp, string name, string suffix, - array order) - : _comp(comp), + array order, + const DeltaTriples::TriplesWithPositionsPerBlock& + triplesWithPositionsPerBlock) + : triplesWithPositionsPerBlock_(triplesWithPositionsPerBlock), + _comp(comp), _readableName(std::move(name)), _fileSuffix(std::move(suffix)), _keyOrder(order) {} @@ -65,7 +75,7 @@ class PermutationImpl { } const auto& metaData = _meta.getMetaData(col0Id); return _reader.scan(metaData, _meta.blockData(), _file, result, - std::move(timer)); + std::move(timer), triplesWithPositionsPerBlock_); } /// For given IDs for the first and second column, retrieve all IDs of the /// third column, and store them in `result`. 
This is just a thin wrapper @@ -79,7 +89,7 @@ class PermutationImpl { const auto& metaData = _meta.getMetaData(col0Id); return _reader.scan(metaData, col1Id, _meta.blockData(), _file, result, - timer); + timer, triplesWithPositionsPerBlock_); } // _______________________________________________________ From 4d22469e47f6f0604dd67c0b6fcddfbdb0b80b84 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Tue, 21 Mar 2023 22:07:19 +0100 Subject: [PATCH 08/20] Better names and own class `LocatedTriple` 1. Improved the names. In particular, `TripleWithPosition` is now called `LocatedTriple` and analogously for the related classes. 2. Put `LocatedTriple` and associated classes in own file `LocatedTriple.h`. Also move a function there that used to be a helper lambda in one of `DeltaTriples` methods. TODO: The `locateTripleInPermutation` method should also be moved to `LocatedTriple.h` and there should be a `test/LocatedTripleTest` with all the associated tests. --- src/global/IdTriple.h | 14 ++ src/index/CompressedRelation.cpp | 16 +- src/index/CompressedRelation.h | 10 +- src/index/DeltaTriples.cpp | 153 +++++++----------- src/index/DeltaTriples.h | 266 ++++++++++--------------------- src/index/LocatedTriple.h | 97 +++++++++++ src/index/Permutations.h | 12 +- test/DeltaTriplesTest.cpp | 213 +++++++++++++------------ 8 files changed, 378 insertions(+), 403 deletions(-) create mode 100644 src/global/IdTriple.h create mode 100644 src/index/LocatedTriple.h diff --git a/src/global/IdTriple.h b/src/global/IdTriple.h new file mode 100644 index 0000000000..d80e72dee7 --- /dev/null +++ b/src/global/IdTriple.h @@ -0,0 +1,14 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#pragma once + +// Should we have an own class for this? We need this at several places. +using IdTriple = std::array; + +// Hash value for such triple.
+template +H AbslHashValue(H h, const IdTriple& triple) { + return H::combine(std::move(h), triple[0], triple[1], triple[2]); +} diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp index d75bad8afb..cd2933ab88 100644 --- a/src/index/CompressedRelation.cpp +++ b/src/index/CompressedRelation.cpp @@ -19,8 +19,7 @@ void CompressedRelationReader::scan( const vector& blockMetadata, ad_utility::File& file, IdTable* result, ad_utility::SharedConcurrentTimeoutTimer timer, - const DeltaTriples::TriplesWithPositionsPerBlock& - triplesWithPositionsPerBlock) const { + const LocatedTriplesPerBlock& locatedTriplesPerBlock) const { AD_CONTRACT_CHECK(result->numColumns() == NumColumns); // get all the blocks where _col0FirstId <= col0Id <= _col0LastId @@ -48,9 +47,8 @@ void CompressedRelationReader::scan( size_t numDeltaTriples = 0; for (auto block = beginBlock; block < endBlock; ++block) { size_t blockIndex = block - blockMetadata.begin(); - if (triplesWithPositionsPerBlock.positionMap_.contains(blockIndex)) { - numDeltaTriples += - triplesWithPositionsPerBlock.positionMap_.at(blockIndex).size(); + if (locatedTriplesPerBlock.map_.contains(blockIndex)) { + numDeltaTriples += locatedTriplesPerBlock.map_.at(blockIndex).size(); } } LOG(INFO) << "Number of delta triples in blocks scanned: " << numDeltaTriples @@ -171,8 +169,7 @@ void CompressedRelationReader::scan( const CompressedRelationMetadata& metaData, Id col1Id, const vector& blocks, ad_utility::File& file, IdTable* result, ad_utility::SharedConcurrentTimeoutTimer timer, - const DeltaTriples::TriplesWithPositionsPerBlock& - triplesWithPositionsPerBlock) const { + const LocatedTriplesPerBlock& locatedTriplesPerBlock) const { AD_CONTRACT_CHECK(result->numColumns() == 1); // Get all the blocks that possibly might contain our pair of col0Id and @@ -204,9 +201,8 @@ void CompressedRelationReader::scan( size_t numDeltaTriples = 0; for (auto block = beginBlock; block < endBlock; ++block) { size_t blockIndex 
= block - blocks.begin(); - if (triplesWithPositionsPerBlock.positionMap_.contains(blockIndex)) { - numDeltaTriples += - triplesWithPositionsPerBlock.positionMap_.at(blockIndex).size(); + if (locatedTriplesPerBlock.map_.contains(blockIndex)) { + numDeltaTriples += locatedTriplesPerBlock.map_.at(blockIndex).size(); } } LOG(INFO) << "Number of delta triples in blocks scanned: " << numDeltaTriples diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index b2291c8af4..b0bf320e04 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -264,9 +264,8 @@ class CompressedRelationReader { const vector& blockMetadata, ad_utility::File& file, IdTable* result, ad_utility::SharedConcurrentTimeoutTimer timer, - const DeltaTriples::TriplesWithPositionsPerBlock& - triplesWithPositionsPerBlock = - DeltaTriples::TriplesWithPositionsPerBlock{}) const; + const LocatedTriplesPerBlock& locatedTriplesPerBlock = + LocatedTriplesPerBlock{}) const; /** * @brief For a permutation XYZ, retrieve all Z for given X and Y. @@ -287,9 +286,8 @@ class CompressedRelationReader { const vector& blocks, ad_utility::File& file, IdTable* result, ad_utility::SharedConcurrentTimeoutTimer timer, - const DeltaTriples::TriplesWithPositionsPerBlock& - triplesWithPositionsPerBlock = - DeltaTriples::TriplesWithPositionsPerBlock{}) const; + const LocatedTriplesPerBlock& locatedTriplesPerBlock = + LocatedTriplesPerBlock{}) const; private: // Read the block that is identified by the `blockMetaData` from the `file`. 
diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 2d65177872..ee8dadfe14 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -5,7 +5,6 @@ #include "index/DeltaTriples.h" #include "absl/strings/str_cat.h" -#include "engine/ExportQueryExecutionTrees.h" #include "index/Index.h" #include "index/IndexImpl.h" #include "parser/TurtleParser.h" @@ -15,31 +14,30 @@ void DeltaTriples::clear() { triplesInserted_.clear(); triplesDeleted_.clear(); - triplesWithPositionsPerBlockInPSO_.clear(); - triplesWithPositionsPerBlockInPOS_.clear(); - triplesWithPositionsPerBlockInSPO_.clear(); - triplesWithPositionsPerBlockInSOP_.clear(); - triplesWithPositionsPerBlockInOSP_.clear(); - triplesWithPositionsPerBlockInOPS_.clear(); + locatedTriplesPerBlockInPSO_.clear(); + locatedTriplesPerBlockInPOS_.clear(); + locatedTriplesPerBlockInSPO_.clear(); + locatedTriplesPerBlockInSOP_.clear(); + locatedTriplesPerBlockInOSP_.clear(); + locatedTriplesPerBlockInOPS_.clear(); } // ____________________________________________________________________________ void DeltaTriples::eraseTripleInAllPermutations( - TriplesWithPositionsIterators& iterators) { + DeltaTriples::LocatedTripleHandles& handles) { // Helper lambda for erasing for one particular permutation. - auto erase = [](TriplesWithPositions::iterator tripleWithPosition, - TriplesWithPositionsPerBlock& triplesWithPositionsPerBlock) { - size_t blockIndex = tripleWithPosition->blockIndex; - triplesWithPositionsPerBlock.positionMap_[blockIndex].erase( - tripleWithPosition); + auto erase = [](LocatedTriples::iterator locatedTriple, + LocatedTriplesPerBlock& locatedTriplesPerBlock) { + size_t blockIndex = locatedTriple->blockIndex; + locatedTriplesPerBlock.map_[blockIndex].erase(locatedTriple); }; // Now erase for all permutations. 
- erase(iterators.iteratorPSO, triplesWithPositionsPerBlockInPSO_); - erase(iterators.iteratorPOS, triplesWithPositionsPerBlockInPOS_); - erase(iterators.iteratorSPO, triplesWithPositionsPerBlockInSPO_); - erase(iterators.iteratorSOP, triplesWithPositionsPerBlockInSOP_); - erase(iterators.iteratorOSP, triplesWithPositionsPerBlockInOSP_); - erase(iterators.iteratorOPS, triplesWithPositionsPerBlockInOPS_); + erase(handles.forPSO, locatedTriplesPerBlockInPSO_); + erase(handles.forPOS, locatedTriplesPerBlockInPOS_); + erase(handles.forSPO, locatedTriplesPerBlockInSPO_); + erase(handles.forSOP, locatedTriplesPerBlockInSOP_); + erase(handles.forOSP, locatedTriplesPerBlockInOSP_); + erase(handles.forOPS, locatedTriplesPerBlockInOPS_); }; // ____________________________________________________________________________ @@ -56,7 +54,7 @@ void DeltaTriples::insertTriple(TurtleTriple turtleTriple) { } // When re-inserting a previously deleted triple, we need to remove the triple // from `triplesDeleted_` AND remove it from all - // `triplesWithPositionsPerBlock` (one per permutation) as well. + // `locatedTriplesPerBlock` (one per permutation) as well. if (triplesDeleted_.contains(idTriple)) { eraseTripleInAllPermutations(triplesDeleted_.at(idTriple)); triplesDeleted_.erase(idTriple); @@ -65,9 +63,9 @@ void DeltaTriples::insertTriple(TurtleTriple turtleTriple) { // Locate the triple in one of the permutations (it does not matter which one) // to check if it already exists in the index. If it already exists, the // insertion is invalid, otherwise insert it. 
- TripleWithPosition tripleWithPosition = locateTripleInPermutation( + LocatedTriple locatedTriple = locateTripleInPermutation( idTriple[1], idTriple[0], idTriple[2], index_.getImpl().PSO()); - if (tripleWithPosition.existsInIndex) { + if (locatedTriple.existsInIndex) { throw std::runtime_error( absl::StrCat("Triple \"", turtleTriple.toString(), "\" already exists in the original index", @@ -91,7 +89,7 @@ void DeltaTriples::deleteTriple(TurtleTriple turtleTriple) { } // When deleting a previously inserted triple (that did not exist in the index // before), we need to remove the triple from `triplesInserted_` AND remove it - // from all `triplesWithPositionsPerBlock` (one per permutation) as well. + // from all `locatedTriplesPerBlock` (one per permutation) as well. if (triplesInserted_.contains(idTriple)) { eraseTripleInAllPermutations(triplesInserted_.at(idTriple)); triplesInserted_.erase(idTriple); @@ -100,9 +98,9 @@ void DeltaTriples::deleteTriple(TurtleTriple turtleTriple) { // Locate the triple in one of the permutations (it does not matter which one) // to check if it actually exists in the index. If it does not exist, the // deletion is invalid, otherwise add as deleted triple. 
- TripleWithPosition tripleWithPosition = locateTripleInPermutation( + LocatedTriple locatedTriple = locateTripleInPermutation( idTriple[1], idTriple[0], idTriple[2], index_.getImpl().PSO()); - if (!tripleWithPosition.existsInIndex) { + if (!locatedTriple.existsInIndex) { throw std::runtime_error( absl::StrCat("Triple \"", turtleTriple.toString(), "\" does not exist in the original index", @@ -113,30 +111,28 @@ void DeltaTriples::deleteTriple(TurtleTriple turtleTriple) { } // ____________________________________________________________________________ -const DeltaTriples::TriplesWithPositionsPerBlock& -DeltaTriples::getTriplesWithPositionsPerBlock( +const LocatedTriplesPerBlock& DeltaTriples::getTriplesWithPositionsPerBlock( Index::Permutation permutation) const { switch (permutation) { case Index::Permutation::PSO: - return triplesWithPositionsPerBlockInPSO_; + return locatedTriplesPerBlockInPSO_; case Index::Permutation::POS: - return triplesWithPositionsPerBlockInPOS_; + return locatedTriplesPerBlockInPOS_; case Index::Permutation::SPO: - return triplesWithPositionsPerBlockInSPO_; + return locatedTriplesPerBlockInSPO_; case Index::Permutation::SOP: - return triplesWithPositionsPerBlockInSOP_; + return locatedTriplesPerBlockInSOP_; case Index::Permutation::OSP: - return triplesWithPositionsPerBlockInOSP_; + return locatedTriplesPerBlockInOSP_; case Index::Permutation::OPS: - return triplesWithPositionsPerBlockInOPS_; + return locatedTriplesPerBlockInOPS_; default: AD_FAIL(); } } // ____________________________________________________________________________ -DeltaTriples::IdTriple DeltaTriples::getIdTriple( - const TurtleTriple& turtleTriple) { +IdTriple DeltaTriples::getIdTriple(const TurtleTriple& turtleTriple) { TripleComponent subject = std::move(turtleTriple._subject); TripleComponent predicate = std::move(turtleTriple._predicate); TripleComponent object = std::move(turtleTriple._object); @@ -147,55 +143,28 @@ DeltaTriples::IdTriple 
DeltaTriples::getIdTriple( } // ____________________________________________________________________________ -DeltaTriples::TriplesWithPositionsIterators -DeltaTriples::locateTripleInAllPermutations(const IdTriple& idTriple) { - // Helper lambda for adding `tripleWithPosition` to given - // `TriplesWithPositionsPerBlock` list. - auto addTripleWithPosition = - [&](const TripleWithPosition& tripleWithPosition, - TriplesWithPositionsPerBlock& triplesWithPositionsPerBlock) - -> TriplesWithPositions::iterator { - TriplesWithPositions& triplesWithPositions = - triplesWithPositionsPerBlock - .positionMap_[tripleWithPosition.blockIndex]; - auto [iterator, wasInserted] = - triplesWithPositions.emplace(tripleWithPosition); - AD_CORRECTNESS_CHECK(wasInserted == true); - AD_CORRECTNESS_CHECK(iterator != triplesWithPositions.end()); - ++triplesWithPositionsPerBlock.size_; - return iterator; - }; - - // Now locate the triple in each permutation and add it to the correct - // `TriplesWithPositionsPerBlock` list. 
+DeltaTriples::LocatedTripleHandles DeltaTriples::locateTripleInAllPermutations( + const IdTriple& idTriple) { auto [s, p, o] = idTriple; - TriplesWithPositionsIterators result; - result.iteratorPSO = addTripleWithPosition( - locateTripleInPermutation(p, s, o, index_.getImpl().PSO()), - triplesWithPositionsPerBlockInPSO_); - result.iteratorPOS = addTripleWithPosition( - locateTripleInPermutation(p, o, s, index_.getImpl().POS()), - triplesWithPositionsPerBlockInPOS_); - result.iteratorSPO = addTripleWithPosition( - locateTripleInPermutation(s, p, o, index_.getImpl().SPO()), - triplesWithPositionsPerBlockInSPO_); - result.iteratorSOP = addTripleWithPosition( - locateTripleInPermutation(s, o, p, index_.getImpl().SOP()), - triplesWithPositionsPerBlockInSOP_); - result.iteratorOSP = addTripleWithPosition( - locateTripleInPermutation(o, s, p, index_.getImpl().OSP()), - triplesWithPositionsPerBlockInOSP_); - result.iteratorOPS = addTripleWithPosition( - locateTripleInPermutation(o, p, s, index_.getImpl().OPS()), - triplesWithPositionsPerBlockInOPS_); - - // Return the iterators. 
- return result; + LocatedTripleHandles handles; + handles.forPSO = locatedTriplesPerBlockInPSO_.add( + locateTripleInPermutation(p, s, o, index_.getImpl().PSO())); + handles.forPOS = locatedTriplesPerBlockInPOS_.add( + locateTripleInPermutation(p, o, s, index_.getImpl().POS())); + handles.forSPO = locatedTriplesPerBlockInSPO_.add( + locateTripleInPermutation(s, p, o, index_.getImpl().SPO())); + handles.forSOP = locatedTriplesPerBlockInSOP_.add( + locateTripleInPermutation(s, o, p, index_.getImpl().SOP())); + handles.forOSP = locatedTriplesPerBlockInOSP_.add( + locateTripleInPermutation(o, s, p, index_.getImpl().OSP())); + handles.forOPS = locatedTriplesPerBlockInOPS_.add( + locateTripleInPermutation(o, p, s, index_.getImpl().OPS())); + return handles; } // ____________________________________________________________________________ template -DeltaTriples::TripleWithPosition DeltaTriples::locateTripleInPermutation( +LocatedTriple DeltaTriples::locateTripleInPermutation( Id id1, Id id2, Id id3, Permutation& permutation) const { // Get the internal data structures from the permutation. auto& file = permutation._file; @@ -228,7 +197,7 @@ DeltaTriples::TripleWithPosition DeltaTriples::locateTripleInPermutation( // Preliminary `FindTripleResult` object with the correct `blockIndex` and // IDs, but still an invalid `rowIndexInBlock` and `existsInIndex` set to // `false`. - TripleWithPosition tripleWithPosition{ + LocatedTriple locatedTriple{ blockIndex, std::numeric_limits::max(), id1, id2, id3, false}; // If all IDs from all blocks are smaller, we return the index of the last @@ -237,7 +206,7 @@ DeltaTriples::TripleWithPosition DeltaTriples::locateTripleInPermutation( // case). if (matchingBlock == blocks.end()) { AD_CORRECTNESS_CHECK(blockIndex == blocks.size()); - return tripleWithPosition; + return locatedTriple; } // Read and decompress the block. 
Note that we are potentially doing this a @@ -289,7 +258,7 @@ DeltaTriples::TripleWithPosition DeltaTriples::locateTripleInPermutation( // Otherwise, `id` is the next larger ID and the position of the first triple // of that relation is exactly the position we are looking for. if (id == id1) { - tripleWithPosition.rowIndexInBlock = + locatedTriple.rowIndexInBlock = std::lower_bound(blockTuples.begin() + offsetBegin, blockTuples.begin() + offsetEnd, std::array{id2, id3}, @@ -299,28 +268,16 @@ DeltaTriples::TripleWithPosition DeltaTriples::locateTripleInPermutation( blockTuples.begin(); // Check if the triple at the found position is equal to `id1 id2 id3`. Note // that our default for `existsInIndex` was set to `false` above. - const size_t& i = tripleWithPosition.rowIndexInBlock; + const size_t& i = locatedTriple.rowIndexInBlock; AD_CORRECTNESS_CHECK(i < blockTuples.size()); if (i < offsetEnd && blockTuples(i, 0) == id2 && blockTuples(i, 1) == id3) { - tripleWithPosition.existsInIndex = true; + locatedTriple.existsInIndex = true; } } else { AD_CORRECTNESS_CHECK(id1 < id); - tripleWithPosition.rowIndexInBlock = offsetBegin; + locatedTriple.rowIndexInBlock = offsetBegin; } // Return the result. - return tripleWithPosition; + return locatedTriple; } - -// ____________________________________________________________________________ -std::string DeltaTriples::getNameForId(Id id) const { - auto lookupResult = - ExportQueryExecutionTrees::idToStringAndType(index_, id, localVocab_); - AD_CONTRACT_CHECK(lookupResult.has_value()); - const auto& [value, type] = lookupResult.value(); - // std::ostringstream os; - // os << "[" << id << "]"; - return type ? 
absl::StrCat("\"", value, "\"^^<", type, ">") : value; - // : absl::StrCat(value, " ", os.str()); -}; diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h index 88c53e3958..ce90b73145 100644 --- a/src/index/DeltaTriples.h +++ b/src/index/DeltaTriples.h @@ -5,184 +5,51 @@ #pragma once #include "engine/LocalVocab.h" +#include "global/IdTriple.h" #include "index/Index.h" #include "index/IndexBuilderTypes.h" +#include "index/LocatedTriple.h" #include "parser/TurtleParser.h" #include "util/HashSet.h" -// The `DeltaTriples` class needs to know the `Index` to which it belongs (for -// translating the components of a Turtle triple to `Id`s and for locating `Id`s -// in the permutations). However, the `Index` class also needs to know the -// `DeltaTriples` (in order to consider these triples when scanning a -// permutation). To avoid a circular include, and since we only need an `Index&` -// here, we can use a forward declaration. -// class Index; - -// A class for maintaining triples that were inserted or deleted after index -// building. -// -// HOW IT WORKS: +// A class for maintaining triples that are inserted or deleted after index +// building, we call these delta triples. How it works in principle: // -// 1. For each "delta triple", find the "matching position" (block index and -// index within that block, see below for a precise definition) for each index -// permutation. +// 1. For each delta triple, find the location in each permutation (block index +// and index within that block, see end of the file for an exact definition). // // 2. For each permutation and each block, store a sorted list of the positions // of the delta triples within that block. // -// 3. In the call of `CompressedRelation::scan`, for each block use the -// information from 2. to check whether there are delta triples for that block -// and if yes, merge them with the triples from the block (given that the -// positions are sorted, this can be done with negligible overhead). 
-// -// NOTE: For now, this only works when the results of index scans are not -// cached (at least not when there are relevant delta triples for that scan). -// There are two ways how this can play out in the future: -// -// Either we generally do not cache the results of index scans anymore. This -// would have various advantages, in particular, joining with something like -// `rdf:type` would then be possible without storing the whole relation in -// RAM. However, we need a faster decompression then and maybe a smaller block -// size (currently 8 MB). -// -// Or we add the delta triples when iterating over the cached (uncompressed) -// result from the index scan. In that case, we would need to (in Step 1 above) -// store and maintain the positions in those uncompressed index scans. -// -// POSITION WHERE A TRIPLE "FITS" IN A PERMUTATION: -// -// 1. If the triple in contained in the permutation, it is contained exactly -// once and so there is a well defined block and position in that block. -// -// 2. If there is a block, where the first triple is smaller and the last triple -// is larger, then that is the block and the position in that block is that of -// the first triple that is (not smaller and hence) larger. -// -// 3. If the triple falls "between two blocks" (the last triple of the previous -// block is smaller and the first triple of the next block is larger), then take -// the first position in that next block. -// -// 4. Two possibilities remain: -// 4a. The triple is smaller than the first triple of the first block: then take -// that block and the first position in that block. -// 4b. The triple is larger than the last triple of the last block: then take -// that block and the last position in that block. +// 3. In the call of `PermutationImpl::scan`, use the respective lists to merge +// the relevant delta triples into the index scan result.
// -// NOTE: For now, this is a proof-of-concept implementation and the class is -// simplistic in many ways (TODO: make a list, in which ways exactly). class DeltaTriples { - public: - // Inside this class, we are working with triples of IDs. - using IdTriple = std::array; - - // Hash value for such triple. - template - friend H AbslHashValue(H h, const IdTriple& triple) { - return H::combine(std::move(h), triple[0], triple[1], triple[2]); - } - - // Result record returned by `locateTripleInPermutation`. - // - // NOTE: This is currently more information then we need. In particular, the - // `blockIndex` is already implicit in `TriplesWithPositionsPerBlock` and the - // bit `existsInOriginalIndex_` can be derived using the information stored in - // a block and our metadata. However, both are useful for testing and for a - // small nuber of delta triples (think millions), the space efficiency of this - // class is not a significant issue. - struct TripleWithPosition { - // The index of the block and the position within that block, where the - // triple "fits". - size_t blockIndex; - size_t rowIndexInBlock; - // The `Id`s of the triple in the order of the permutation. For example, - // for an object pertaining to the SPO permutation: `id1` is the subject, - // `id2` is the predicate, and `id3` is the object. - Id id1; - Id id2; - Id id3; - // Whether the triple exists in the original index or is new. - bool existsInIndex; - }; - - // All delta triples located at the same position in the original index. - // - // NOTE: A lambda does not work here because it has to be `static constexpr` - // and then I get a strange warning about "a field ... whose type uses the - // anonymous namespace". I also tried overloading `std::less`, but the - // required `namespace std { ... }` does not work at this point in the code, - // and I didn't want to have it somewhere else than here. 
- struct TriplesWithPositionsCompare { - bool operator()(const TripleWithPosition& x, - const TripleWithPosition& y) const { - return IdTriple{x.id1, x.id2, x.id3} < IdTriple{y.id1, y.id2, y.id3}; - } - }; - using TriplesWithPositions = - std::set; - using TriplesWithPositionsIterator = TriplesWithPositions::iterator; - - // Data structures with positions for a particular permutation. - class TriplesWithPositionsPerBlock { - private: - // The number of `TripleWithPosition` objects stored. - size_t size_ = 0; - friend DeltaTriples; - - public: - // Map from block index to position list. - // - // TODO: Keep the position list for each block index sorted (primary key: - // row index in block, secondary key: triple order). - // - // TODO: Should be private, but we want to iterate over it for testing. - ad_utility::HashMap positionMap_; - - public: - // Get the positions for a given block index. Returns an empty list if there - // are no positions for that block index. - // - // TODO: Check if that is the behavior we want when actually using class - // `DeltaTriples` to augment the result of an index scan. - TriplesWithPositions getTriplesWithPositionsForBlock(size_t blockIndex) { - auto it = positionMap_.find(blockIndex); - if (it != positionMap_.end()) { - return it->second; - } else { - return {}; - } - } - - // Get the number of `TripleWithPosition` objects. - size_t size() const { return size_; } - - // Empty the data structure. - void clear() { - positionMap_.clear(); - size_ = 0; - } - }; - private: // The index to which these triples are added. const Index& index_; - // The positions of the delta triples in each of the six permutations. 
- TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInPSO_; - TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInPOS_; - TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInSPO_; - TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInSOP_; - TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInOSP_; - TriplesWithPositionsPerBlock triplesWithPositionsPerBlockInOPS_; + // The local vocabulary of the delta triples (they may have components, + // which are not contained in the vocabulary of the original index). + LocalVocab localVocab_; - // Each inserted or deleted triple needs to know where it is stored in each of - // the six `TriplesWithPositionsPerBlock` above. - struct TriplesWithPositionsIterators { - TriplesWithPositions::iterator iteratorPSO; - TriplesWithPositions::iterator iteratorPOS; - TriplesWithPositions::iterator iteratorSPO; - TriplesWithPositions::iterator iteratorSOP; - TriplesWithPositions::iterator iteratorOSP; - TriplesWithPositions::iterator iteratorOPS; + // The positions of the delta triples in each of the six permutations. + LocatedTriplesPerBlock locatedTriplesPerBlockInPSO_; + LocatedTriplesPerBlock locatedTriplesPerBlockInPOS_; + LocatedTriplesPerBlock locatedTriplesPerBlockInSPO_; + LocatedTriplesPerBlock locatedTriplesPerBlockInSOP_; + LocatedTriplesPerBlock locatedTriplesPerBlockInOSP_; + LocatedTriplesPerBlock locatedTriplesPerBlockInOPS_; + + // Each delta triple needs to know where it is stored in each of the six + // `LocatedTriplesPerBlock` above. 
+ struct LocatedTripleHandles { + LocatedTriples::iterator forPSO; + LocatedTriples::iterator forPOS; + LocatedTriples::iterator forSPO; + LocatedTriples::iterator forSOP; + LocatedTriples::iterator forOPS; + LocatedTriples::iterator forOSP; }; // The sets of triples added to and subtracted from the original index @@ -191,16 +58,20 @@ class DeltaTriples { // triples are added that are not already contained in the original index and // that only triples are subtracted that are contained in the original index. // In particular, no triple can be in both of these sets. - ad_utility::HashMap triplesInserted_; - ad_utility::HashMap triplesDeleted_; - - // The local vocabulary of these triples. - LocalVocab localVocab_; + ad_utility::HashMap triplesInserted_; + ad_utility::HashMap triplesDeleted_; public: // Construct for given index. DeltaTriples(const Index& index) : index_(index) {} + // Get the `Index` to which these delta triples refer. + const Index& getIndex() const { return index_; } + + // Get the common `LocalVocab` of the delta triples. + LocalVocab& localVocab() { return localVocab_; } + const LocalVocab& localVocab() const { return localVocab_; } + // Clear `_triplesAdded` and `_triplesSubtracted` and all associated data // structures. void clear(); @@ -216,7 +87,7 @@ class DeltaTriples { void deleteTriple(TurtleTriple turtleTriple); // Get `TripleWithPosition` objects for given permutation. - const TriplesWithPositionsPerBlock& getTriplesWithPositionsPerBlock( + const LocatedTriplesPerBlock& getTriplesWithPositionsPerBlock( Index::Permutation permutation) const; // TODO: made public as long as we are trying to figure out how this works. @@ -231,29 +102,68 @@ class DeltaTriples { IdTriple getIdTriple(const TurtleTriple& turtleTriple); // Find the position of the given triple in the given permutation and add it - // to each of the six `TriplesWithPositionsPerBlock` maps (one per + // to each of the six `LocatedTriplesPerBlock` maps (one per // permutation). 
Return the iterators of where it was added (so that we can // easily delete it again from these maps later). // // TODO: The function is name is misleading, since this method does not only // locate, but also add to the mentioned data structures. - TriplesWithPositionsIterators locateTripleInAllPermutations( - const IdTriple& idTriple); + LocatedTripleHandles locateTripleInAllPermutations(const IdTriple& idTriple); // The implementation of the above function for a given permutation. template - TripleWithPosition locateTripleInPermutation(Id id1, Id id2, Id id3, - Permutation& permutation) const; + LocatedTriple locateTripleInPermutation(Id id1, Id id2, Id id3, + Permutation& permutation) const; - // Erase `TripleWithPosition` object from each `TriplesWithPositionsPerBlock` - // list. The argument are iterators for each list, as returned by the method + // Erase `LocatedTriple` object from each `LocatedTriplesPerBlock` list. The + // argument are iterators for each list, as returned by the method // `locateTripleInAllPermutations` above. // // NOTE: The iterators are invalid afterwards. That is OK, as long as we also // delete the respective entry in `triplesInserted_` or `triplesDeleted_`, // which stores these iterators. - void eraseTripleInAllPermutations(TriplesWithPositionsIterators& iterators); - - // Resolve ID to name (useful for debugging and testing). - std::string getNameForId(Id id) const; + void eraseTripleInAllPermutations(LocatedTripleHandles& handles); }; + +// More detailed discussion and information about the `DeltaTriples` class. +// +// A. DELTA TRIPLES AND THE CACHE +// +// For now, our approach only works when the results of index scans are not +// cached (unless there are no relevant delta triples for a particular scan). +// There are two ways how this can play out in the future: +// +// Either we generally do not cache the results of index scans anymore. 
This +// would have various advantages, in particular, joining with something like +// `rdf:type` would then be possible without storing the whole relation in +// RAM. However, we need a faster decompression then and maybe a smaller block +// size (currently 8 MB). +// +// Or we add the delta triples when iterating over the cached (uncompressed) +// result from the index scan. In that case, we would need to (in Step 1 above) +// store and maintain the positions in those uncompressed index scans. However, +// this would only work for the results of index scans. For the results of more +// complex subqueries, it's hard to figure out which delta triples are relevant. +// +// B. DEFINITION OF THE POSITION OF A DELTA TRIPLE IN A PERMUTATION +// +// 1. The position is defined by the index of a block in the permutation and the +// index of a row within that block. +// +// 2. If the triple is contained in the permutation, it is contained exactly +// once and so there is a well defined block and position in that block. +// +// 3. If there is a block, where the first triple is smaller and the last triple +// is larger, then that is the block and the position in that block is that of +// the first triple that is (not smaller and hence) larger. +// +// 4. If the triple falls "between two blocks" (the last triple of the previous +// block is smaller and the first triple of the next block is larger), then the +// position is the first position in that next block. +// +// 5. As a special case of 4., if the triple is smaller than all triples in the +// permutation, the position is the first position of the first block. +// +// 6. If the triple is larger than all triples in the permutation, the block +// index is one after the largest block index and the position within that +// non-existing block is arbitrary.
diff --git a/src/index/LocatedTriple.h b/src/index/LocatedTriple.h new file mode 100644 index 0000000000..4d7b697fe6 --- /dev/null +++ b/src/index/LocatedTriple.h @@ -0,0 +1,97 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#include "global/IdTriple.h" +#include "util/HashMap.h" + +#pragma once + +// Result record returned by `locateTripleInPermutation`. +// +// NOTE: This is currently more information than we need. In particular, the +// `blockIndex` is already implicit in `LocatedTriplesPerBlock` and the +// bit `existsInIndex` can be derived using the information stored in +// a block and our metadata. However, both are useful for testing and for a +// small number of delta triples (think millions), the space efficiency of this +// class is not a significant issue. +struct LocatedTriple { + // The index of the block and the position within that block, where the + // triple "fits". + size_t blockIndex; + size_t rowIndexInBlock; + // The `Id`s of the triple in the order of the permutation. For example, + // for an object pertaining to the SPO permutation: `id1` is the subject, + // `id2` is the predicate, and `id3` is the object. + Id id1; + Id id2; + Id id3; + // Whether the triple exists in the original index or is new. + bool existsInIndex; +}; + +// All delta triples located at the same position in the original index. +// +// NOTE: A lambda does not work here because it has to be `static constexpr` +// and then I get a strange warning about "a field ... whose type uses the +// anonymous namespace". I also tried overloading `std::less`, but the +// required `namespace std { ... }` does not work at this point in the code, +// and I didn't want to have it somewhere else than here.
+struct LocatedTripleCompare { + bool operator()(const LocatedTriple& x, const LocatedTriple& y) const { + return IdTriple{x.id1, x.id2, x.id3} < IdTriple{y.id1, y.id2, y.id3}; + } +}; +using LocatedTriples = std::set; + +// Data structures with positions for a particular permutation. +class LocatedTriplesPerBlock { + private: + // The number of `LocatedTriple` objects stored. + size_t size_ = 0; + + public: + // Map from block index to position list. + // + // TODO: Keep the position list for each block index sorted (primary key: + // row index in block, secondary key: triple order). + // + // TODO: Should be private, but we want to iterate over it for testing. + ad_utility::HashMap map_; + + public: + // Get the positions for a given block index. Returns an empty list if there + // are no positions for that block index. + // + // TODO: Check if that is the behavior we want when actually using class + // `DeltaTriples` to augment the result of an index scan. + LocatedTriples getLocatedTriplesForBlock(size_t blockIndex) { + auto it = map_.find(blockIndex); + if (it != map_.end()) { + return it->second; + } else { + return {}; + } + } + + // Add the given `locatedTriple` to the given `LocatedTriplesPerBlock`. + // Returns a handle to where it was added (via which we can easily remove it + // again if we need to). + LocatedTriples::iterator add(const LocatedTriple& locatedTriple) { + LocatedTriples& locatedTriples = map_[locatedTriple.blockIndex]; + auto [handle, wasInserted] = locatedTriples.emplace(locatedTriple); + AD_CORRECTNESS_CHECK(wasInserted == true); + AD_CORRECTNESS_CHECK(handle != locatedTriples.end()); + ++size_; + return handle; + }; + + // Get the total number of `LocatedTriple` objects (for all blocks). + size_t size() const { return size_; } + + // Empty the data structure. 
+ void clear() { + map_.clear(); + size_ = 0; + } +}; diff --git a/src/index/Permutations.h b/src/index/Permutations.h index b6d7bb422e..00e0118acc 100644 --- a/src/index/Permutations.h +++ b/src/index/Permutations.h @@ -25,16 +25,14 @@ template class PermutationImpl { private: // The delta triples and their positions in this permutation. - const DeltaTriples::TriplesWithPositionsPerBlock& - triplesWithPositionsPerBlock_; + const LocatedTriplesPerBlock& locatedTriplesPerBlock_; public: using MetaData = MetaDataT; PermutationImpl(const Comparator& comp, string name, string suffix, array order, - const DeltaTriples::TriplesWithPositionsPerBlock& - triplesWithPositionsPerBlock) - : triplesWithPositionsPerBlock_(triplesWithPositionsPerBlock), + const LocatedTriplesPerBlock& locatedTriplesPerBlock) + : locatedTriplesPerBlock_(locatedTriplesPerBlock), _comp(comp), _readableName(std::move(name)), _fileSuffix(std::move(suffix)), @@ -75,7 +73,7 @@ class PermutationImpl { } const auto& metaData = _meta.getMetaData(col0Id); return _reader.scan(metaData, _meta.blockData(), _file, result, - std::move(timer), triplesWithPositionsPerBlock_); + std::move(timer), locatedTriplesPerBlock_); } /// For given IDs for the first and second column, retrieve all IDs of the /// third column, and store them in `result`. 
This is just a thin wrapper @@ -89,7 +87,7 @@ class PermutationImpl { const auto& metaData = _meta.getMetaData(col0Id); return _reader.scan(metaData, col1Id, _meta.blockData(), _file, result, - timer, triplesWithPositionsPerBlock_); + timer, locatedTriplesPerBlock_); } // _______________________________________________________ diff --git a/test/DeltaTriplesTest.cpp b/test/DeltaTriplesTest.cpp index 253315940d..98e6d6d678 100644 --- a/test/DeltaTriplesTest.cpp +++ b/test/DeltaTriplesTest.cpp @@ -7,13 +7,13 @@ #include "./IndexTestHelpers.h" #include "./util/GTestHelpers.h" #include "absl/strings/str_split.h" +#include "engine/ExportQueryExecutionTrees.h" #include "index/DeltaTriples.h" #include "index/IndexImpl.h" #include "parser/TurtleParser.h" // Shortcuts to these full type names used frequently in the following. -// using DeltaTriples::IdTriple; -// using DeltaTriples::TriplesWithPositionsPerBlock; +// using IdTriple; static const std::vector permutationEnums = { Index::Permutation::PSO, Index::Permutation::POS, Index::Permutation::SPO, Index::Permutation::SOP, Index::Permutation::OPS, Index::Permutation::OSP}; @@ -62,11 +62,23 @@ class DeltaTriplesTest : public ::testing::Test { // Make `IdTriple` from given Turtle input (the first argument is not `const` // because we might change the local vocabulary). - DeltaTriples::IdTriple makeIdTriple(DeltaTriples& deltaTriples, - std::string turtle) { + IdTriple makeIdTriple(DeltaTriples& deltaTriples, std::string turtle) { return deltaTriples.getIdTriple(makeTurtleTriple(std::move(turtle))); } + // Resolve the name for the given `Id` using the `Index` and `LocalVocab` from + // the given `deltaTriples` object. 
+ std::string getNameForId(Id id, const DeltaTriples& deltaTriples) const { + auto lookupResult = ExportQueryExecutionTrees::idToStringAndType( + deltaTriples.getIndex(), id, deltaTriples.localVocab()); + AD_CONTRACT_CHECK(lookupResult.has_value()); + const auto& [value, type] = lookupResult.value(); + // std::ostringstream os; + // os << "[" << id << "]"; + return type ? absl::StrCat("\"", value, "\"^^<", type, ">") : value; + // : absl::StrCat(value, " ", os.str()); + }; + // Get human-readable names for the given `permutation` and `idTriple`. This // is needed for proper message when an assert fails in the tests below. The // `idTriple` is assumed to be already in the right permutation (for example, @@ -74,11 +86,11 @@ class DeltaTriplesTest : public ::testing::Test { template std::pair getNicePermutationAndTripleName( const DeltaTriples& deltaTriples, const Permutation& permutation, - const DeltaTriples::IdTriple idTriple) { + const IdTriple idTriple) { auto& namePermutation = permutation._readableName; - std::string nameId1 = deltaTriples.getNameForId(idTriple[0]); - std::string nameId2 = deltaTriples.getNameForId(idTriple[1]); - std::string nameId3 = deltaTriples.getNameForId(idTriple[2]); + std::string nameId1 = getNameForId(idTriple[0], deltaTriples); + std::string nameId2 = getNameForId(idTriple[1], deltaTriples); + std::string nameId3 = getNameForId(idTriple[2], deltaTriples); std::string nameTriple = absl::StrCat(std::string{namePermutation[0]}, "=", nameId1, " ", std::string{namePermutation[1]}, "=", nameId2, " ", @@ -87,7 +99,7 @@ class DeltaTriplesTest : public ::testing::Test { } // Check that all six `triplesWithPositionsPerBlock` lists have the given - // number of `TripleWithPosition` objects. + // number of `LocatedTriple` objects. 
void checkTriplesWithPositionsPerBlockSize(const DeltaTriples& deltaTriples, size_t expectedSize) { for (Index::Permutation permutation : permutationEnums) { @@ -204,7 +216,7 @@ TEST_F(DeltaTriplesTest, showAllRelationIdsForPermutation) { std::cout << "Block #" << (++blockCount) << ":"; for (const Id& id : block) { std::cout << " " - << (id != Id::makeUndefined() ? deltaTriples.getNameForId(id) + << (id != Id::makeUndefined() ? getNameForId(id, deltaTriples) : "UNDEF") << std::flush; } @@ -255,26 +267,24 @@ TEST_F(DeltaTriplesTest, insertAndDeleteTriples) { const Index& index = testQec->getIndex(); DeltaTriples deltaTriples(index); - // Check the given `tripleWithPosition` (a block index, an index in the + // Check the given `locatedTriple` (a block index, an index in the // block, and a triple) is correct for the given permutation as follows: // - // 1. If `tripleWithPosition.existsInIndex == true`, check that the + // 1. If `locatedTriple.existsInIndex == true`, check that the // triple indeed occurs at that position in the respective triple. // - // 2. If `tripleWithPosition.existsInIndex == false`, check that the + // 2. If `locatedTriple.existsInIndex == false`, check that the // triple at the position is larger and the triple at the previous // position is smaller. auto checkTripleWithPositionInPermutation = - [&](const DeltaTriples::TripleWithPosition& tripleWithPosition, - const auto& permutation, + [&](const LocatedTriple& locatedTriple, const auto& permutation, const std::vector>& relationIdsPerBlock) { // Shortcuts for the tiples ids and its position. 
- const size_t blockIndex = tripleWithPosition.blockIndex; - const size_t rowIndexInBlock = tripleWithPosition.rowIndexInBlock; - const bool existsInIndex = tripleWithPosition.existsInIndex; - const DeltaTriples::IdTriple deltaTriple{tripleWithPosition.id1, - tripleWithPosition.id2, - tripleWithPosition.id3}; + const size_t blockIndex = locatedTriple.blockIndex; + const size_t rowIndexInBlock = locatedTriple.rowIndexInBlock; + const bool existsInIndex = locatedTriple.existsInIndex; + const IdTriple deltaTriple{locatedTriple.id1, locatedTriple.id2, + locatedTriple.id3}; // Members for accessing the data of a permutation. auto& file = permutation._file; @@ -298,9 +308,9 @@ TEST_F(DeltaTriplesTest, insertAndDeleteTriples) { const vector& metadataPerBlock = meta.blockData(); AD_CONTRACT_CHECK(metadataPerBlock.size() > 0); - DeltaTriples::IdTriple lastTriple{metadataPerBlock.back()._col0LastId, - metadataPerBlock.back()._col1LastId, - metadataPerBlock.back()._col2LastId}; + IdTriple lastTriple{metadataPerBlock.back()._col0LastId, + metadataPerBlock.back()._col1LastId, + metadataPerBlock.back()._col2LastId}; if (blockIndex >= metadataPerBlock.size()) { ASSERT_EQ(blockIndex, metadataPerBlock.size()) << msg; ASSERT_FALSE(existsInIndex); @@ -322,23 +332,22 @@ TEST_F(DeltaTriplesTest, insertAndDeleteTriples) { const auto& blockTuples = reader.readAndDecompressBlock(blockMetadata, file, std::nullopt); ASSERT_LT(rowIndexInBlock, blockTuples.size()) << msg; - DeltaTriples::IdTriple blockTriple{ - relationIdsPerBlock[blockIndex][rowIndexInBlock], - blockTuples(rowIndexInBlock, 0), blockTuples(rowIndexInBlock, 1)}; - auto blockTriplePrevious = [&]() -> DeltaTriples::IdTriple { + IdTriple blockTriple{relationIdsPerBlock[blockIndex][rowIndexInBlock], + blockTuples(rowIndexInBlock, 0), + blockTuples(rowIndexInBlock, 1)}; + auto blockTriplePrevious = [&]() -> IdTriple { if (rowIndexInBlock > 0) { - return DeltaTriples::IdTriple{ + return IdTriple{ 
relationIdsPerBlock[blockIndex][rowIndexInBlock - 1], blockTuples(rowIndexInBlock - 1, 0), blockTuples(rowIndexInBlock - 1, 1)}; } else if (blockIndex > 0) { - return DeltaTriples::IdTriple{ - metadataPerBlock[blockIndex - 1]._col0LastId, - metadataPerBlock[blockIndex - 1]._col1LastId, - metadataPerBlock[blockIndex - 1]._col2LastId}; + return IdTriple{metadataPerBlock[blockIndex - 1]._col0LastId, + metadataPerBlock[blockIndex - 1]._col1LastId, + metadataPerBlock[blockIndex - 1]._col2LastId}; } else { - return DeltaTriples::IdTriple{ - Id::makeUndefined(), Id::makeUndefined(), Id::makeUndefined()}; + return IdTriple{Id::makeUndefined(), Id::makeUndefined(), + Id::makeUndefined()}; } }(); @@ -353,24 +362,23 @@ TEST_F(DeltaTriplesTest, insertAndDeleteTriples) { } }; - // Check that all `tripleWithPosition` in `positionsPerBlock` are + // Check that all `locatedTriple` in `positionsPerBlock` are // correct for the given permutation. auto checkAllTriplesWithPositionsForPermutation = - [&](const DeltaTriples::TriplesWithPositionsPerBlock& - triplesWithPositionsPerBlock, + [&](const LocatedTriplesPerBlock& triplesWithPositionsPerBlock, const auto& permutation) { std::vector> allRelationIdsForPermutation = getAllRelationIdsForPermutation(permutation); for (const auto& [blockIndex, triplesWithPositions] : - triplesWithPositionsPerBlock.positionMap_) { - for (const auto& tripleWithPosition : triplesWithPositions) { - checkTripleWithPositionInPermutation( - tripleWithPosition, permutation, allRelationIdsForPermutation); + triplesWithPositionsPerBlock.map_) { + for (const auto& locatedTriple : triplesWithPositions) { + checkTripleWithPositionInPermutation(locatedTriple, permutation, + allRelationIdsForPermutation); } } }; - // Check that all `tripleWithPosition`s are correct (for all + // Check that all `locatedTriple`s are correct (for all // permutations). the given permutation. 
auto checkAllTriplesWithPositionForAllPermutations = [&](const DeltaTriples& deltaTriples) { @@ -398,11 +406,11 @@ TEST_F(DeltaTriplesTest, insertAndDeleteTriples) { // permutation. // // TODO: Check that `existsInIndex` was set correctly. Test test routine - // above just take it from the tested `TripleWithPosition` objects + // above just take it from the tested `LocatedTriple` objects // (which might be wrong) // // TODO: Check that each triple that was located was indeed added to - // each of the `TriplesWithPositionsPerBlock` objects. + // each of the `LocatedTriplesPerBlock` objects. // // TODO: Eventually, we should test `insertTriple` and `deleteTriple`, // which only insert a triple when it doesn't exist in the original @@ -469,70 +477,67 @@ TEST_F(DeltaTriplesTest, findTripleInAllPermutationsVisualize) { << std::endl; // Search the triple in all permutations. - DeltaTriples::IdTriple idTriple = makeIdTriple(deltaTriples, tripleAsString); - auto iterators = deltaTriples.locateTripleInAllPermutations(idTriple); + IdTriple idTriple = makeIdTriple(deltaTriples, tripleAsString); + auto handles = deltaTriples.locateTripleInAllPermutations(idTriple); // Helper lambda for showing the block from the given permutation that - // contains the given (via an iterator) `TripleWithPosition` object. - auto showBlock = - [&](DeltaTriples::TriplesWithPositions::iterator& tripleWithPosition, - const auto& permutation) { - // Shortcuts for the triple and its position. - // AD_CORRECTNESS_CHECK(tripleWithPosition != tripleWithPosition.end()); - const size_t blockIndex = tripleWithPosition->blockIndex; - const size_t rowIndexInBlock = tripleWithPosition->rowIndexInBlock; - const bool existsInIndex = tripleWithPosition->existsInIndex; - const DeltaTriples::IdTriple deltaTriple{tripleWithPosition->id1, - tripleWithPosition->id2, - tripleWithPosition->id3}; - - // Get nice names for the permutation and the triple. 
- auto [namePermutation, nameTriple] = getNicePermutationAndTripleName( - deltaTriples, permutation, deltaTriple); + // contains the given (via an iterator) `LocatedTriple` object. + auto showBlock = [&](LocatedTriples::iterator& locatedTriple, + const auto& permutation) { + // Shortcuts for the triple and its position. + // AD_CORRECTNESS_CHECK(locatedTriple != locatedTriple.end()); + const size_t blockIndex = locatedTriple->blockIndex; + const size_t rowIndexInBlock = locatedTriple->rowIndexInBlock; + const bool existsInIndex = locatedTriple->existsInIndex; + const IdTriple deltaTriple{locatedTriple->id1, locatedTriple->id2, + locatedTriple->id3}; + + // Get nice names for the permutation and the triple. + auto [namePermutation, nameTriple] = + getNicePermutationAndTripleName(deltaTriples, permutation, deltaTriple); + + // If we are beyond the last block, there is nothing to show. + const vector& blockMetas = + permutation._meta.blockData(); + if (blockIndex >= blockMetas.size()) { + std::cout << endl; + std::cout << "All triples in " << namePermutation << " are smaller than " + << nameTriple << std::endl; + return; + } - // If we are beyond the last block, there is nothing to show. - const vector& blockMetas = - permutation._meta.blockData(); - if (blockIndex >= blockMetas.size()) { - std::cout << endl; - std::cout << "All triples in " << namePermutation - << " are smaller than " << nameTriple << std::endl; - return; - } + // Read the block and compute all relation `Id`s. + const CompressedBlockMetadata& blockMetadata = blockMetas[blockIndex]; + DecompressedBlock blockTuples = permutation._reader.readAndDecompressBlock( + blockMetadata, permutation._file, std::nullopt); + std::vector blockRelationIds = + getAllRelationIdsForPermutation(permutation).at(blockIndex); + AD_CORRECTNESS_CHECK(blockRelationIds.size() == blockTuples.size()); - // Read the block and compute all relation `Id`s. 
- const CompressedBlockMetadata& blockMetadata = blockMetas[blockIndex]; - DecompressedBlock blockTuples = - permutation._reader.readAndDecompressBlock( - blockMetadata, permutation._file, std::nullopt); - std::vector blockRelationIds = - getAllRelationIdsForPermutation(permutation).at(blockIndex); - AD_CORRECTNESS_CHECK(blockRelationIds.size() == blockTuples.size()); - - // Show the triples in the block. - std::cout << std::endl; - std::cout << "Block #" << blockIndex << " from " << namePermutation - << " (" << nameTriple << "):" << std::endl; - for (size_t i = 0; i < blockTuples.numRows(); ++i) { - std::cout << "Row #" << i << ": " - << deltaTriples.getNameForId(blockRelationIds[i]); - for (size_t j = 0; j < blockTuples.numColumns(); ++j) { - std::cout << " " << deltaTriples.getNameForId(blockTuples(i, j)); - } - if (i == rowIndexInBlock) { - std::cout << " <-- " - << (existsInIndex ? "existing triple" : "new triple"); - } - std::cout << std::endl; - } - }; + // Show the triples in the block. + std::cout << std::endl; + std::cout << "Block #" << blockIndex << " from " << namePermutation << " (" + << nameTriple << "):" << std::endl; + for (size_t i = 0; i < blockTuples.numRows(); ++i) { + std::cout << "Row #" << i << ": " + << getNameForId(blockRelationIds[i], deltaTriples); + for (size_t j = 0; j < blockTuples.numColumns(); ++j) { + std::cout << " " << getNameForId(blockTuples(i, j), deltaTriples); + } + if (i == rowIndexInBlock) { + std::cout << " <-- " + << (existsInIndex ? "existing triple" : "new triple"); + } + std::cout << std::endl; + } + }; // Show block for each permutation. 
- showBlock(iterators.iteratorPOS, index.getImpl().POS()); - showBlock(iterators.iteratorPSO, index.getImpl().PSO()); - showBlock(iterators.iteratorSPO, index.getImpl().SPO()); - showBlock(iterators.iteratorSOP, index.getImpl().SOP()); - showBlock(iterators.iteratorOSP, index.getImpl().OSP()); - showBlock(iterators.iteratorOPS, index.getImpl().OPS()); + showBlock(handles.forPOS, index.getImpl().POS()); + showBlock(handles.forPSO, index.getImpl().PSO()); + showBlock(handles.forSPO, index.getImpl().SPO()); + showBlock(handles.forSOP, index.getImpl().SOP()); + showBlock(handles.forOSP, index.getImpl().OSP()); + showBlock(handles.forOPS, index.getImpl().OPS()); std::cout << std::endl; } From 04861ce87b101fe10e81104d0646d40faa6b6c31 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Fri, 31 Mar 2023 20:41:16 +0200 Subject: [PATCH 09/20] Refactor code and first try to merge delta triples 1. The code for locating triples for an individual permutation is now no longer in the (already too big) class `DeltaTriples` but in separate classes in files `LocatedTriples.{h,cpp}`. The corresponding tests are also in a new file `test/LocatedTriplesTest.cpp` now. Used the opportunity to improve the code in several respects. 2. First attempt at writing a function that, for a given block, merges the relevant delta triples into it. Tried to do it in-place without using an extra array, until I realized that that leads to a very hard algorithmic problem, which most likely can't be solved practically efficiently. At least it's clear now that the best approach is to first decompress the block into a temporary array and then merge that temporary array and the relevant delta triples into the pre-allocated portion of the result `IdTable`. That's what I will implement next and it shouldn't be hard. 3. When testing the merging, it helps to output the columns of an `IdTable`.
For that, values like `VocabIndex:15` are rather unpractical to read, so I abbreviated these to use a one-letter prefix, like in `V:15`. --- src/global/Id.h | 6 +- src/global/IdTriple.h | 4 + src/global/ValueId.h | 12 +- src/index/CMakeLists.txt | 1 + src/index/ConstantsIndexBuilding.h | 6 +- src/index/DeltaTriples.cpp | 171 +++-------------- src/index/DeltaTriples.h | 13 +- src/index/LocatedTriple.h | 97 ---------- src/index/LocatedTriples.cpp | 294 +++++++++++++++++++++++++++++ src/index/LocatedTriples.h | 123 ++++++++++++ test/CMakeLists.txt | 2 + test/DeltaTriplesTest.cpp | 6 +- test/LocatedTriplesTest.cpp | 96 ++++++++++ test/ValueIdTest.cpp | 12 +- test/ValuesForTestingTest.cpp | 4 +- 15 files changed, 574 insertions(+), 273 deletions(-) delete mode 100644 src/index/LocatedTriple.h create mode 100644 src/index/LocatedTriples.cpp create mode 100644 src/index/LocatedTriples.h create mode 100644 test/LocatedTriplesTest.cpp diff --git a/src/global/Id.h b/src/global/Id.h index 2eaacadc4f..62a96d0fff 100644 --- a/src/global/Id.h +++ b/src/global/Id.h @@ -8,9 +8,9 @@ #include #include -#include "../util/Exception.h" -#include "./IndexTypes.h" -#include "./ValueId.h" +#include "global/IndexTypes.h" +#include "global/ValueId.h" +#include "util/Exception.h" using Id = ValueId; typedef uint16_t Score; diff --git a/src/global/IdTriple.h b/src/global/IdTriple.h index d80e72dee7..0353b8c747 100644 --- a/src/global/IdTriple.h +++ b/src/global/IdTriple.h @@ -4,6 +4,10 @@ #pragma once +#include + +#include "global/Id.h" + // Should we have an own class for this? We need this at several places. 
using IdTriple = std::array; diff --git a/src/global/ValueId.h b/src/global/ValueId.h index 02855700d5..4163cf8b29 100644 --- a/src/global/ValueId.h +++ b/src/global/ValueId.h @@ -10,10 +10,10 @@ #include #include -#include "../util/BitUtils.h" -#include "../util/NBitInteger.h" -#include "../util/Serializer/Serializer.h" -#include "./IndexTypes.h" +#include "global/IndexTypes.h" +#include "util/BitUtils.h" +#include "util/NBitInteger.h" +#include "util/Serializer/Serializer.h" /// The different Datatypes that a `ValueId` (see below) can encode. enum struct Datatype { @@ -244,10 +244,10 @@ class ValueId { /// This operator is only for debugging and testing. It returns a /// human-readable representation. friend std::ostream& operator<<(std::ostream& ostr, const ValueId& id) { - ostr << toString(id.getDatatype()) << ':'; + ostr << toString(id.getDatatype())[0] << ':'; auto visitor = [&ostr](T&& value) { if constexpr (ad_utility::isSimilar) { - ostr << "Undefined"; + ostr << "xx"; } else if constexpr (ad_utility::isSimilar || ad_utility::isSimilar) { ostr << std::to_string(value); diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index e935088ff4..c1016af609 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -8,6 +8,7 @@ add_library(index VocabularyOnDisk.h VocabularyOnDisk.cpp IndexMetaData.h IndexMetaDataImpl.h MetaDataHandler.h + LocatedTriples.h LocatedTriples.cpp DeltaTriples.h DeltaTriples.cpp StxxlSortFunctors.h TextMetaData.cpp TextMetaData.h diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h index 40679aa42b..15426f56d3 100644 --- a/src/index/ConstantsIndexBuilding.h +++ b/src/index/ConstantsIndexBuilding.h @@ -84,8 +84,4 @@ constexpr size_t QUEUE_SIZE_AFTER_PARALLEL_PARSING = 10; // // NOTE: For playing around with `DeltaTriples`, I am setting this to a // deliberately small number. -constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 16; // 64 KB. 
-// constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 17; // 128 KB. -// constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 13; // 8 KB. -// constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 3 * 16; -// constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 23u; +constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 100'000; diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index ee8dadfe14..7d8c9a5783 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -7,6 +7,7 @@ #include "absl/strings/str_cat.h" #include "index/Index.h" #include "index/IndexImpl.h" +#include "index/LocatedTriples.h" #include "parser/TurtleParser.h" #include "util/Timer.h" @@ -22,6 +23,32 @@ void DeltaTriples::clear() { locatedTriplesPerBlockInOPS_.clear(); } +// ____________________________________________________________________________ +DeltaTriples::LocatedTripleHandles DeltaTriples::locateTripleInAllPermutations( + const IdTriple& idTriple) { + auto [s, p, o] = idTriple; + LocatedTripleHandles handles; + handles.forPSO = + locatedTriplesPerBlockInPSO_.add(LocatedTriple::locateTripleInPermutation( + p, s, o, index_.getImpl().PSO())); + handles.forPOS = + locatedTriplesPerBlockInPOS_.add(LocatedTriple::locateTripleInPermutation( + p, o, s, index_.getImpl().POS())); + handles.forSPO = + locatedTriplesPerBlockInSPO_.add(LocatedTriple::locateTripleInPermutation( + s, p, o, index_.getImpl().SPO())); + handles.forSOP = + locatedTriplesPerBlockInSOP_.add(LocatedTriple::locateTripleInPermutation( + s, o, p, index_.getImpl().SOP())); + handles.forOSP = + locatedTriplesPerBlockInOSP_.add(LocatedTriple::locateTripleInPermutation( + o, s, p, index_.getImpl().OSP())); + handles.forOPS = + locatedTriplesPerBlockInOPS_.add(LocatedTriple::locateTripleInPermutation( + o, p, s, index_.getImpl().OPS())); + return handles; +} + // ____________________________________________________________________________ void DeltaTriples::eraseTripleInAllPermutations( 
DeltaTriples::LocatedTripleHandles& handles) { @@ -63,7 +90,7 @@ void DeltaTriples::insertTriple(TurtleTriple turtleTriple) { // Locate the triple in one of the permutations (it does not matter which one) // to check if it already exists in the index. If it already exists, the // insertion is invalid, otherwise insert it. - LocatedTriple locatedTriple = locateTripleInPermutation( + LocatedTriple locatedTriple = LocatedTriple::locateTripleInPermutation( idTriple[1], idTriple[0], idTriple[2], index_.getImpl().PSO()); if (locatedTriple.existsInIndex) { throw std::runtime_error( @@ -98,7 +125,7 @@ void DeltaTriples::deleteTriple(TurtleTriple turtleTriple) { // Locate the triple in one of the permutations (it does not matter which one) // to check if it actually exists in the index. If it does not exist, the // deletion is invalid, otherwise add as deleted triple. - LocatedTriple locatedTriple = locateTripleInPermutation( + LocatedTriple locatedTriple = LocatedTriple::locateTripleInPermutation( idTriple[1], idTriple[0], idTriple[2], index_.getImpl().PSO()); if (!locatedTriple.existsInIndex) { throw std::runtime_error( @@ -141,143 +168,3 @@ IdTriple DeltaTriples::getIdTriple(const TurtleTriple& turtleTriple) { Id objectId = std::move(object).toValueId(index_.getVocab(), localVocab_); return IdTriple{subjectId, predId, objectId}; } - -// ____________________________________________________________________________ -DeltaTriples::LocatedTripleHandles DeltaTriples::locateTripleInAllPermutations( - const IdTriple& idTriple) { - auto [s, p, o] = idTriple; - LocatedTripleHandles handles; - handles.forPSO = locatedTriplesPerBlockInPSO_.add( - locateTripleInPermutation(p, s, o, index_.getImpl().PSO())); - handles.forPOS = locatedTriplesPerBlockInPOS_.add( - locateTripleInPermutation(p, o, s, index_.getImpl().POS())); - handles.forSPO = locatedTriplesPerBlockInSPO_.add( - locateTripleInPermutation(s, p, o, index_.getImpl().SPO())); - handles.forSOP = 
locatedTriplesPerBlockInSOP_.add( - locateTripleInPermutation(s, o, p, index_.getImpl().SOP())); - handles.forOSP = locatedTriplesPerBlockInOSP_.add( - locateTripleInPermutation(o, s, p, index_.getImpl().OSP())); - handles.forOPS = locatedTriplesPerBlockInOPS_.add( - locateTripleInPermutation(o, p, s, index_.getImpl().OPS())); - return handles; -} - -// ____________________________________________________________________________ -template -LocatedTriple DeltaTriples::locateTripleInPermutation( - Id id1, Id id2, Id id3, Permutation& permutation) const { - // Get the internal data structures from the permutation. - auto& file = permutation._file; - const auto& meta = permutation._meta; - const auto& reader = permutation._reader; - - // Find the index of the first block where the last triple is not smaller. - // - // NOTE: With `_col2LastId` added to `CompressedBlockMetadata`, this can now - // be computed without having to decompress any blocks at this point. See the - // first revision of this branch for code, where blocks with equal `id1` and - // `id2` were decompressed to also check for `id3`. - const vector& blocks = meta.blockData(); - auto matchingBlock = std::lower_bound( - blocks.begin(), blocks.end(), std::array{id1, id2, id3}, - [&](const CompressedBlockMetadata& block, const auto& triple) -> bool { - if (block._col0LastId < triple[0]) { - return true; - } else if (block._col0LastId == triple[0]) { - if (block._col1LastId < triple[1]) { - return true; - } else if (block._col1LastId == triple[1]) { - return block._col2LastId < triple[2]; - } - } - return false; - }); - size_t blockIndex = matchingBlock - blocks.begin(); - - // Preliminary `FindTripleResult` object with the correct `blockIndex` and - // IDs, but still an invalid `rowIndexInBlock` and `existsInIndex` set to - // `false`. 
- LocatedTriple locatedTriple{ - blockIndex, std::numeric_limits::max(), id1, id2, id3, false}; - - // If all IDs from all blocks are smaller, we return the index of the last - // block plus one (typical "end" semantics) and any position in the block (in - // the code that uses the result, that position will not be used in this - // case). - if (matchingBlock == blocks.end()) { - AD_CORRECTNESS_CHECK(blockIndex == blocks.size()); - return locatedTriple; - } - - // Read and decompress the block. Note that we are potentially doing this a - // second time here (the block has probably already been looked at in the call - // to `std::lower_bound` above). - DecompressedBlock blockTuples = - reader.readAndDecompressBlock(*matchingBlock, file, std::nullopt); - - // Find the smallest "relation" ID that is not smaller than `id1` and get its - // metadata and the position of the first and last triple with that ID in the - // block. - // - // IMPORTANT FIX: If relation `id1` exists in the index, but our triple is - // larger than all triples of that relation in the index and the last triple - // of that relation ends a block, then our block search above (correctly) - // landed us at the next block. We can detect this by checking whether the - // first relation ID of the block is larger than `id1` and then we should - // get the metadata for the ID and not for `id1` (which would pertain to a - // previous block). - // - // TODO: There is still a bug in `MetaDataWrapperHashMap::lower_bound`, which - // is relevant in the rare case where a triple is inserted with an `Id` for - // predicate that is not a new `Id`, but has not been used for a predicate in - // the original index. - // - // NOTE: Since we have already handled the case, where all IDs in the - // permutation are smaller, above, such a relation should exist. - Id searchId = - matchingBlock->_col0FirstId > id1 ? 
matchingBlock->_col0FirstId : id1; - const auto& it = meta._data.lower_bound(searchId); - AD_CORRECTNESS_CHECK(it != meta._data.end()); - Id id = it.getId(); - const auto& relationMetadata = meta.getMetaData(id); - size_t offsetBegin = relationMetadata._offsetInBlock; - size_t offsetEnd = offsetBegin + relationMetadata._numRows; - // Note: If the relation spans multiple blocks, we know that the block we - // found above contains only triples from that relation. - if (offsetBegin == std::numeric_limits::max()) { - offsetBegin = 0; - offsetEnd = blockTuples.size(); - } - AD_CORRECTNESS_CHECK(offsetBegin <= blockTuples.size()); - AD_CORRECTNESS_CHECK(offsetEnd <= blockTuples.size()); - - // If we have found `id1`, we can do a binary search in the portion of the - // block that pertains to it (note the special case mentioned above, where we - // are already at the beginning of the next block). - // - // Otherwise, `id` is the next larger ID and the position of the first triple - // of that relation is exactly the position we are looking for. - if (id == id1) { - locatedTriple.rowIndexInBlock = - std::lower_bound(blockTuples.begin() + offsetBegin, - blockTuples.begin() + offsetEnd, - std::array{id2, id3}, - [](const auto& a, const auto& b) { - return a[0] < b[0] || (a[0] == b[0] && a[1] < b[1]); - }) - - blockTuples.begin(); - // Check if the triple at the found position is equal to `id1 id2 id3`. Note - // that our default for `existsInIndex` was set to `false` above. - const size_t& i = locatedTriple.rowIndexInBlock; - AD_CORRECTNESS_CHECK(i < blockTuples.size()); - if (i < offsetEnd && blockTuples(i, 0) == id2 && blockTuples(i, 1) == id3) { - locatedTriple.existsInIndex = true; - } - } else { - AD_CORRECTNESS_CHECK(id1 < id); - locatedTriple.rowIndexInBlock = offsetBegin; - } - - // Return the result. 
- return locatedTriple; -} diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h index ce90b73145..a06f16dd79 100644 --- a/src/index/DeltaTriples.h +++ b/src/index/DeltaTriples.h @@ -8,7 +8,7 @@ #include "global/IdTriple.h" #include "index/Index.h" #include "index/IndexBuilderTypes.h" -#include "index/LocatedTriple.h" +#include "index/LocatedTriples.h" #include "parser/TurtleParser.h" #include "util/HashSet.h" @@ -102,19 +102,14 @@ class DeltaTriples { IdTriple getIdTriple(const TurtleTriple& turtleTriple); // Find the position of the given triple in the given permutation and add it - // to each of the six `LocatedTriplesPerBlock` maps (one per - // permutation). Return the iterators of where it was added (so that we can - // easily delete it again from these maps later). + // to each of the six `LocatedTriplesPerBlock` maps (one per permutation). + // Return the iterators of where it was added (so that we can easily delete it + // again from these maps later). // // TODO: The function is name is misleading, since this method does not only // locate, but also add to the mentioned data structures. LocatedTripleHandles locateTripleInAllPermutations(const IdTriple& idTriple); - // The implementation of the above function for a given permutation. - template - LocatedTriple locateTripleInPermutation(Id id1, Id id2, Id id3, - Permutation& permutation) const; - // Erase `LocatedTriple` object from each `LocatedTriplesPerBlock` list. The // argument are iterators for each list, as returned by the method // `locateTripleInAllPermutations` above. 
diff --git a/src/index/LocatedTriple.h b/src/index/LocatedTriple.h deleted file mode 100644 index 4d7b697fe6..0000000000 --- a/src/index/LocatedTriple.h +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright 2023, University of Freiburg -// Chair of Algorithms and Data Structures -// Authors: Hannah Bast - -#include "global/IdTriple.h" -#include "util/HashMap.h" - -#pragma once - -// Result record returned by `locateTripleInPermutation`. -// -// NOTE: This is currently more information then we need. In particular, the -// `blockIndex` is already implicit in `LocatedTriplesPerBlock` and the -// bit `existsInOriginalIndex_` can be derived using the information stored in -// a block and our metadata. However, both are useful for testing and for a -// small nuber of delta triples (think millions), the space efficiency of this -// class is not a significant issue. -struct LocatedTriple { - // The index of the block and the position within that block, where the - // triple "fits". - size_t blockIndex; - size_t rowIndexInBlock; - // The `Id`s of the triple in the order of the permutation. For example, - // for an object pertaining to the SPO permutation: `id1` is the subject, - // `id2` is the predicate, and `id3` is the object. - Id id1; - Id id2; - Id id3; - // Whether the triple exists in the original index or is new. - bool existsInIndex; -}; - -// All delta triples located at the same position in the original index. -// -// NOTE: A lambda does not work here because it has to be `static constexpr` -// and then I get a strange warning about "a field ... whose type uses the -// anonymous namespace". I also tried overloading `std::less`, but the -// required `namespace std { ... }` does not work at this point in the code, -// and I didn't want to have it somewhere else than here. 
-struct LocatedTripleCompare { - bool operator()(const LocatedTriple& x, const LocatedTriple& y) const { - return IdTriple{x.id1, x.id2, x.id3} < IdTriple{y.id1, y.id2, y.id3}; - } -}; -using LocatedTriples = std::set; - -// Data structures with positions for a particular permutation. -class LocatedTriplesPerBlock { - private: - // The number of `LocatedTriple` objects stored. - size_t size_ = 0; - - public: - // Map from block index to position list. - // - // TODO: Keep the position list for each block index sorted (primary key: - // row index in block, secondary key: triple order). - // - // TODO: Should be private, but we want to iterate over it for testing. - ad_utility::HashMap map_; - - public: - // Get the positions for a given block index. Returns an empty list if there - // are no positions for that block index. - // - // TODO: Check if that is the behavior we want when actually using class - // `DeltaTriples` to augment the result of an index scan. - LocatedTriples getLocatedTriplesForBlock(size_t blockIndex) { - auto it = map_.find(blockIndex); - if (it != map_.end()) { - return it->second; - } else { - return {}; - } - } - - // Add the given `locatedTriple` to the given `LocatedTriplesPerBlock`. - // Returns a handle to where it was added (via which we can easily remove it - // again if we need to). - LocatedTriples::iterator add(const LocatedTriple& locatedTriple) { - LocatedTriples& locatedTriples = map_[locatedTriple.blockIndex]; - auto [handle, wasInserted] = locatedTriples.emplace(locatedTriple); - AD_CORRECTNESS_CHECK(wasInserted == true); - AD_CORRECTNESS_CHECK(handle != locatedTriples.end()); - ++size_; - return handle; - }; - - // Get the total number of `LocatedTriple` objects (for all blocks). - size_t size() const { return size_; } - - // Empty the data structure. 
- void clear() { - map_.clear(); - size_ = 0; - } -}; diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp new file mode 100644 index 0000000000..cca49be166 --- /dev/null +++ b/src/index/LocatedTriples.cpp @@ -0,0 +1,294 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#include "index/LocatedTriples.h" + +#include + +#include "index/CompressedRelation.h" +#include "index/IndexMetaData.h" +#include "index/Permutations.h" + +// ____________________________________________________________________________ +template +size_t LocatedTriplesPerBlock::numTriplesInBlockImpl(size_t blockIndex, Id id1, + Id id2) const { + size_t count = 0; + for (const LocatedTriple& locatedTriple : map_.at(blockIndex)) { + if constexpr (matchMode == MatchMode::MatchAll) { + ++count; + } else if constexpr (matchMode == MatchMode::MatchId1) { + count += (locatedTriple.id1 == id1); + } else if constexpr (matchMode == MatchMode::MatchId1AndId2) { + count += (locatedTriple.id1 == id1 && locatedTriple.id2 == id2); + } + } + return count; +} + +// ____________________________________________________________________________ +size_t LocatedTriplesPerBlock::numTriplesInBlock(size_t blockIndex) const { + return numTriplesInBlockImpl(blockIndex); +} + +// ____________________________________________________________________________ +size_t LocatedTriplesPerBlock::numTriplesInBlock(size_t blockIndex, + Id id1) const { + return numTriplesInBlockImpl(blockIndex, id1); +} + +// ____________________________________________________________________________ +size_t LocatedTriplesPerBlock::numTriplesInBlock(size_t blockIndex, Id id1, + Id id2) const { + return numTriplesInBlockImpl(blockIndex, id1, id2); +} + +// ____________________________________________________________________________ +void LocatedTriplesPerBlock::mergeTriplesIntoBlock(size_t blockIndex, + IdTable& idTable, size_t pos, + size_t len, int netGrowth) { + // 
If there are no triples in the given block, there is nothing to do. + if (!map_.contains(blockIndex)) { + AD_CONTRACT_CHECK(netGrowth == 0); + return; + } + const size_t numColumns = idTable.numColumns(); + + // Iterate backwards over the triples from the given block and process in + // groups of triples with the same `rowIndexInBlock`. + // + // NOTE: We have to keep track of the positions separately, since pointer + // arithmetic does not work on `LocatedTriples`, which is a `std::set`. That's + // also why we can't use `upper_bound` for searching. + const LocatedTriples& locatedTriples = map_.at(blockIndex); + auto locatedTriple = locatedTriples.rbegin(); + int shift = netGrowth; + size_t previousRowIndex = len; + while (locatedTriple != locatedTriples.rend()) { + // Search backwards for the next triple with a new `rowIndexInBlock`. Check + // that only the last triple in a group may already exist in the index (all + // triples with the same row index are <= the triple at that position in + // `idTable`). + size_t rowIndex = locatedTriple->rowIndexInBlock; + int groupNetGrowth = locatedTriple->existsInIndex ? -1 : 0; + auto nextLocatedTriple = locatedTriple; + ++nextLocatedTriple; + while (nextLocatedTriple != locatedTriples.rend() && + nextLocatedTriple->rowIndexInBlock == rowIndex) { + AD_CORRECTNESS_CHECK(nextLocatedTriple->existsInIndex == false); + ++groupNetGrowth; + ++nextLocatedTriple; + } + std::transform(locatedTriple, nextLocatedTriple, + std::ostream_iterator(std::cout, " "), + [](const LocatedTriple& lt) { + return absl::StrCat(lt.rowIndexInBlock, "[", + lt.existsInIndex, "]"); + }); + std::cout << "... shift = " << shift << ", net growth = " << groupNetGrowth + << std::endl; + + // If the last triple in the group exists in the index, don't copy that + // (recall that we are iterating in reverse order). Because of how `shift` + // is updated (see the `shift -= groupNetGrowth` below), the matching + // entry in `idTable` will be overwritten.
+ if (locatedTriple->existsInIndex) { + ++locatedTriple; + } + + // Make space in `idTable` at the right place for the new group of triples + // and insert the new `Id`s (and overwrite those of the deleted triples). + // + // NOTE: If the `idTable` has two columns, we write `id2` and `id3`. If it + // has only one column, we only write `id3`. + AD_CONTRACT_CHECK(numColumns == 1 || numColumns == 2); + for (size_t colIndex = 0; colIndex < numColumns; ++colIndex) { + std::span column = idTable.getColumn(colIndex); + // Shifting left or right requires two different functions. + if (shift >= 0) { + std::shift_right(column.begin() + pos + rowIndex, + column.begin() + pos + previousRowIndex + shift, + shift); + } else { + std::shift_left(column.begin() + pos + rowIndex, + column.begin() + pos + previousRowIndex - shift, + -shift); + } + // Show the changed column after the shift. + if (1) { + std::cout << "Col #" << colIndex << ": "; + std::copy(column.begin(), column.end(), + std::ostream_iterator(std::cout, " ")); + std::cout << " [after shift]" << std::endl; + } + // Add the new triples. + std::transform( + locatedTriple, nextLocatedTriple, + column.rbegin() + column.size() - pos - previousRowIndex - shift, + [&colIndex, &numColumns](const LocatedTriple& lt) { + return colIndex + 1 < numColumns ? lt.id2 : lt.id3; + }); + // For debugging only: null the gaps. + if (0) { + if (shift > 0 && shift > groupNetGrowth) { + for (int i = 0; i < shift - groupNetGrowth; ++i) { + column[pos + rowIndex + i] = Id::makeUndefined(); + } + } + } + // Show the changed column. + if (0) { + std::cout << "Col #" << colIndex << ": "; + std::copy(column.begin(), column.end(), + std::ostream_iterator(std::cout, " ")); + std::cout << " [after add]" << std::endl; + } + } + + // Update for the next iteration. 
+ previousRowIndex = rowIndex; + shift -= groupNetGrowth; + locatedTriple = nextLocatedTriple; + } + // AD_CORRECTNESS_CHECK(shift == 0); + // AD_CORRECTNESS_CHECK(std::is_sorted( + // idTable.begin() + pos, idTable.begin() + pos + len + numNewTriples)); + + // Do something to `idTable` to make the compiler happy. + // std::shift_right(idTable.begin() + pos, + // idTable.begin() + pos + len + numNewTriples, + // numNewTriples); +} + +// ____________________________________________________________________________ +template +LocatedTriple LocatedTriple::locateTripleInPermutation( + Id id1, Id id2, Id id3, const Permutation& permutation) { + // Get the internal data structures from the permutation. + auto& file = permutation._file; + const auto& meta = permutation._meta; + const auto& reader = permutation._reader; + + // Find the index of the first block where the last triple is not smaller. + // + // NOTE: With `_col2LastId` added to `CompressedBlockMetadata`, this can + // now be computed without having to decompress any blocks at this point. + // See the first revision of this branch for code, where blocks with equal + // `id1` and `id2` were decompressed to also check for `id3`. + const vector& blocks = meta.blockData(); + auto matchingBlock = std::lower_bound( + blocks.begin(), blocks.end(), std::array{id1, id2, id3}, + [&](const CompressedBlockMetadata& block, const auto& triple) -> bool { + if (block._col0LastId < triple[0]) { + return true; + } else if (block._col0LastId == triple[0]) { + if (block._col1LastId < triple[1]) { + return true; + } else if (block._col1LastId == triple[1]) { + return block._col2LastId < triple[2]; + } + } + return false; + }); + size_t blockIndex = matchingBlock - blocks.begin(); + + // Preliminary `LocatedTriple` object with the correct `blockIndex` and + // IDs, but still an invalid `rowIndexInBlock` and `existsInIndex` set to + // `false`.
+ LocatedTriple locatedTriple{ + blockIndex, std::numeric_limits::max(), id1, id2, id3, false}; + + // If all IDs from all blocks are smaller, we return the index of the last + // block plus one (typical "end" semantics) and any position in the block + // (in the code that uses the result, that position will not be used in + // this case). + if (matchingBlock == blocks.end()) { + AD_CORRECTNESS_CHECK(blockIndex == blocks.size()); + return locatedTriple; + } + + // Read and decompress the block. Note that we are potentially doing this + // a second time here (the block has probably already been looked at in + // the call to `std::lower_bound` above). + DecompressedBlock blockTuples = + reader.readAndDecompressBlock(*matchingBlock, file, std::nullopt); + + // Find the smallest "relation" ID that is not smaller than `id1` and get + // its metadata and the position of the first and last triple with that ID + // in the block. + // + // IMPORTANT FIX: If relation `id1` exists in the index, but our triple is + // larger than all triples of that relation in the index and the last + // triple of that relation ends a block, then our block search above + // (correctly) landed us at the next block. We can detect this by checking + // whether the first relation ID of the block is larger than `id1` and + // then we should get the metadata for the ID and not for `id1` (which + // would pertain to a previous block). + // + // TODO: There is still a bug in `MetaDataWrapperHashMap::lower_bound`, + // which is relevant in the rare case where a triple is inserted with an + // `Id` for predicate that is not a new `Id`, but has not been used for a + // predicate in the original index. + // + // NOTE: Since we have already handled the case, where all IDs in the + // permutation are smaller, above, such a relation should exist. + Id searchId = + matchingBlock->_col0FirstId > id1 ? 
matchingBlock->_col0FirstId : id1; + const auto& it = meta._data.lower_bound(searchId); + AD_CORRECTNESS_CHECK(it != meta._data.end()); + Id id = it.getId(); + const auto& relationMetadata = meta.getMetaData(id); + size_t offsetBegin = relationMetadata._offsetInBlock; + size_t offsetEnd = offsetBegin + relationMetadata._numRows; + // Note: If the relation spans multiple blocks, we know that the block we + // found above contains only triples from that relation. + if (offsetBegin == std::numeric_limits::max()) { + offsetBegin = 0; + offsetEnd = blockTuples.size(); + } + AD_CORRECTNESS_CHECK(offsetBegin <= blockTuples.size()); + AD_CORRECTNESS_CHECK(offsetEnd <= blockTuples.size()); + + // If we have found `id1`, we can do a binary search in the portion of the + // block that pertains to it (note the special case mentioned above, where + // we are already at the beginning of the next block). + // + // Otherwise, `id` is the next larger ID and the position of the first + // triple of that relation is exactly the position we are looking for. + if (id == id1) { + locatedTriple.rowIndexInBlock = + std::lower_bound(blockTuples.begin() + offsetBegin, + blockTuples.begin() + offsetEnd, + std::array{id2, id3}, + [](const auto& a, const auto& b) { + return a[0] < b[0] || (a[0] == b[0] && a[1] < b[1]); + }) - + blockTuples.begin(); + // Check if the triple at the found position is equal to `id1 id2 id3`. + // Note that our default for `existsInIndex` was set to `false` above. + const size_t& i = locatedTriple.rowIndexInBlock; + AD_CORRECTNESS_CHECK(i < blockTuples.size()); + if (i < offsetEnd && blockTuples(i, 0) == id2 && blockTuples(i, 1) == id3) { + locatedTriple.existsInIndex = true; + } + } else { + AD_CORRECTNESS_CHECK(id1 < id); + locatedTriple.rowIndexInBlock = offsetBegin; + } + + // Return the result. + return locatedTriple; +} + +// Explicit instantiation for the six permutation. 
+#define INSTANTIATE_LTIP(Permutation) \ + template LocatedTriple \ + LocatedTriple::locateTripleInPermutation(Id, Id, Id, \ + const Permutation&); +INSTANTIATE_LTIP(Permutation::PSO_T) +INSTANTIATE_LTIP(Permutation::POS_T) +INSTANTIATE_LTIP(Permutation::SPO_T) +INSTANTIATE_LTIP(Permutation::SOP_T) +INSTANTIATE_LTIP(Permutation::OPS_T) +INSTANTIATE_LTIP(Permutation::OSP_T) diff --git a/src/index/LocatedTriples.h b/src/index/LocatedTriples.h new file mode 100644 index 0000000000..0dd97fe91e --- /dev/null +++ b/src/index/LocatedTriples.h @@ -0,0 +1,123 @@ +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Authors: Hannah Bast + +#include "engine/idTable/IdTable.h" +#include "global/IdTriple.h" +#include "util/HashMap.h" + +#pragma once + +// A triple and its location in a particular permutation. +// +// NOTE: Technically, `blockIndex` and the `existsInIndex` are redundant in this +// record because they can be derived when the clas is used. However, both are +// useful for testing and for a small nuber of delta triples (think millions), +// the space efficiency of this class is not a significant issue. +struct LocatedTriple { + // The index of the block and the position within that block, where the + // triple "fits". + size_t blockIndex; + size_t rowIndexInBlock; + // The `Id`s of the triple in the order of the permutation. For example, + // for an object pertaining to the SPO permutation: `id1` is the subject, + // `id2` is the predicate, and `id3` is the object. + Id id1; + Id id2; + Id id3; + // True iff the triple exists in the permutation (then it is equal to the + // triple at the position given by `blockIndex` and `rowIndexInBlock`). + bool existsInIndex; + + // Locate the given triple in the given permutation. 
+ template + static LocatedTriple locateTripleInPermutation( + Id id1, Id id2, Id id3, const Permutation& permutation); +}; + +// A sorted set of triples located at the same position in a particular +// permutation. Note that we could also overload `std::less` here. +struct LocatedTripleCompare { + bool operator()(const LocatedTriple& x, const LocatedTriple& y) const { + return IdTriple{x.id1, x.id2, x.id3} < IdTriple{y.id1, y.id2, y.id3}; + } +}; +using LocatedTriples = std::set; + +// A sorted set of triples located in particular permutation, grouped by block. +class LocatedTriplesPerBlock { + private: + // The total number of `LocatedTriple` objects stored (for all blocks). + size_t numTriples_ = 0; + + public: + // Map with the list of triples per block. + // + // TODO: Should be private. Should we make `LocatedTriplesPerBlock` a subclass + // of `HashMap` or is that bad style? + ad_utility::HashMap map_; + + public: + // Get the number of located triples for the given block and that match the + // `id1` (if provided) and `id2` (if provided). + size_t numTriplesInBlock(size_t blockIndex) const; + size_t numTriplesInBlock(size_t blockIndex, Id id1) const; + size_t numTriplesInBlock(size_t blockIndex, Id id1, Id id2) const; + + // Merge the located triples for the given block into the given `IdTable` + // segment. It is the resposibility of the caller that there is space for + // `numNewTriples` triples starting from `end`. Like for `numTriplesInBlock` + // above, consider only triples that match `id1` (if provided) and `id2` (if + // provided). 
+ void mergeTriplesIntoBlock(size_t blockIndex, IdTable& idTable, size_t pos, + size_t len, int netGrowth); + void mergeTriplesIntoBlock(size_t blockIndex, size_t rowIndexOffset, Id id1, + IdTable& idTable, size_t pos, size_t end, + int netGrowth); + void mergeTriplesIntoBlock(size_t blockIndex, size_t rowIndexOffset, Id id1, + Id id2, IdTable& idTable, size_t pos, size_t end, + int netGrowth); + + // Add the given `locatedTriple` to the given `LocatedTriplesPerBlock`. + // Returns a handle to where it was added (via which we can easily remove it + // again if we need to). + LocatedTriples::iterator add(const LocatedTriple& locatedTriple) { + LocatedTriples& locatedTriples = map_[locatedTriple.blockIndex]; + auto [handle, wasInserted] = locatedTriples.emplace(locatedTriple); + AD_CORRECTNESS_CHECK(wasInserted == true); + AD_CORRECTNESS_CHECK(handle != locatedTriples.end()); + ++numTriples_; + return handle; + }; + + // Get the total number of `LocatedTriple` objects (for all blocks). + size_t numTriples() const { return numTriples_; } + + // Get the number of blocks containing `LocatedTriple` objects. + size_t numBlocks() const { return map_.size(); } + + // Empty the data structure. + void clear() { + map_.clear(); + numTriples_ = 0; + } + + private: + // Match modes for `numTriplesInBlockImpl` and `mergeTriplesIntoBlockImpl`. + enum struct MatchMode { MatchAll, MatchId1, MatchId1AndId2 }; + + // The Implementation behind the public method `numTriplesInBlock` above. + template + size_t numTriplesInBlockImpl(size_t blockIndex, Id id1 = Id::makeUndefined(), + Id id2 = Id::makeUndefined()) const; + + // The Implementation behind the public method `mergeTriplesIntoBlock` above. + // The only reason that the arguments `id1` and `id2` come at the end here is + // so that we can give them default values. 
+ template + void mergeTriplesIntoBlock(size_t blockIndex, IdTable::iterator begin, + IdTable::iterator end, size_t numNewTriples, + size_t rowIndexOffset = 0, + Id id1 = Id::makeUndefined(), + Id id2 = Id::makeUndefined()) const; +}; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index b473f4ee01..766b0edc8d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -106,6 +106,8 @@ addLinkAndDiscoverTestSerial(IndexTest index) addLinkAndDiscoverTestSerial(DeltaTriplesTest index) +addLinkAndDiscoverTestSerial(LocatedTriplesTest index) + addLinkAndDiscoverTest(FTSAlgorithmsTest index) addLinkAndDiscoverTest(EngineTest engine) diff --git a/test/DeltaTriplesTest.cpp b/test/DeltaTriplesTest.cpp index 98e6d6d678..052c74ee65 100644 --- a/test/DeltaTriplesTest.cpp +++ b/test/DeltaTriplesTest.cpp @@ -103,9 +103,9 @@ class DeltaTriplesTest : public ::testing::Test { void checkTriplesWithPositionsPerBlockSize(const DeltaTriples& deltaTriples, size_t expectedSize) { for (Index::Permutation permutation : permutationEnums) { - ASSERT_EQ( - deltaTriples.getTriplesWithPositionsPerBlock(permutation).size(), - expectedSize); + ASSERT_EQ(deltaTriples.getTriplesWithPositionsPerBlock(permutation) + .numTriples(), + expectedSize); } } diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp new file mode 100644 index 0000000000..2647269476 --- /dev/null +++ b/test/LocatedTriplesTest.cpp @@ -0,0 +1,96 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Hannah Bast + +#include + +#include "./util/IdTableHelpers.h" +#include "./util/IdTestHelpers.h" +#include "index/LocatedTriples.h" + +// TODO: Why the namespace here? (copied from `test/IndexMetaDataTest.cpp`) +namespace { +auto V = ad_utility::testing::VocabId; +} + +// Fixture that ... TODO:explain. 
+class LocatedTriplesTest : public ::testing::Test { + protected: + // Make `LocatedTriplesPerBlock` from a list of `LocatedTriple` objects (the + // order in which the objects are given does not matter). + LocatedTriplesPerBlock makeLocatedTriplesPerBlock( + std::vector locatedTriples) { + LocatedTriplesPerBlock result; + for (auto locatedTriple : locatedTriples) { + result.add(locatedTriple); + } + return result; + } +}; + +// Test the method that counts the number of `LocatedTriple's in a block. +TEST_F(LocatedTriplesTest, numTriplesInBlock) { + // Set up lists of located triples for three blocks. + auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( + {LocatedTriple{1, 0, V(10), V(1), V(0), true}, + LocatedTriple{1, 0, V(10), V(2), V(1), true}, + LocatedTriple{1, 0, V(11), V(3), V(0), true}, + LocatedTriple{2, 0, V(20), V(4), V(0), true}, + LocatedTriple{2, 0, V(21), V(5), V(0), true}, + LocatedTriple{3, 0, V(30), V(6), V(0), true}, + LocatedTriple{3, 0, V(32), V(7), V(0), true}}); + ASSERT_EQ(locatedTriplesPerBlock.numBlocks(), 3); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(), 7); + + // Check the total counts per block. + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(1), 3); + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(2), 2); + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(3), 2); + + // Check the counts per block for a given `id1`. + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(1, V(10)), 2); + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(1, V(11)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(2, V(20)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(2, V(21)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(3, V(30)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(3, V(32)), 1); + + // Check the counts per block for a given `id1` and `id2`. 
+ ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(1, V(10), V(1)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(1, V(10), V(2)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(1, V(11), V(3)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(2, V(20), V(4)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(2, V(21), V(5)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(3, V(30), V(6)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(3, V(32), V(7)), 1); +} + +// Test the method that merges the matching `LocatedTriple`s from a block into a +// part of an `IdTable`. +TEST_F(LocatedTriplesTest, mergeTriplesIntoBlock) { + // An `IdTable` with two columns that could be the result of reading a single + // block in an index scan. + IdTable idTable = makeIdTableFromVector({{10, 10}, // Row 0 + {15, 20}, // Row 1 + {15, 30}, // Row 2 + {20, 10}, // Row 3 + {30, 20}, // Row 4 + {30, 30}}); // Row 5 + + // Set up a list of located triples for a single block, at various positions + // in the block. + auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( + {LocatedTriple{1, 0, V(1), V(10), V(10), true}, // Delete row 0 + LocatedTriple{1, 1, V(1), V(10), V(11), false}, // Insert before row 1 + LocatedTriple{1, 1, V(1), V(11), V(10), false}, // Insert before row 1 + LocatedTriple{1, 4, V(1), V(21), V(11), false}, // Insert before row 4 + LocatedTriple{1, 4, V(1), V(30), V(10), false}, // Insert before row 4 + LocatedTriple{1, 4, V(1), V(30), V(20), true}, // Delete row 4 + LocatedTriple{1, 5, V(1), V(30), V(30), true}}); // Delete row 5 + + // Merge all these triples into the whole table. Since four triples are added + // and three triples are deleted, the net growth is one triple. 
+ size_t netGrowth = 1; + idTable.resize(idTable.size() + netGrowth); + locatedTriplesPerBlock.mergeTriplesIntoBlock(1, idTable, 0, 6, netGrowth); +} diff --git a/test/ValueIdTest.cpp b/test/ValueIdTest.cpp index 6111e06ba4..08a4d7f58d 100644 --- a/test/ValueIdTest.cpp +++ b/test/ValueIdTest.cpp @@ -278,12 +278,12 @@ TEST(ValueId, toDebugString) { stream << id; ASSERT_EQ(stream.str(), expected); }; - test(ValueId::makeUndefined(), "Undefined:Undefined"); - test(ValueId::makeFromInt(-42), "Int:-42"); - test(ValueId::makeFromDouble(42.0), "Double:42.000000"); - test(makeVocabId(15), "VocabIndex:15"); - test(makeLocalVocabId(25), "LocalVocabIndex:25"); - test(makeTextRecordId(37), "TextRecordIndex:37"); + test(ValueId::makeUndefined(), "U:xx"); + test(ValueId::makeFromInt(-42), "I:-42"); + test(ValueId::makeFromDouble(42.0), "D:42.000000"); + test(makeVocabId(15), "V:15"); + test(makeLocalVocabId(25), "L:25"); + test(makeTextRecordId(37), "T:37"); } TEST(ValueId, TriviallyCopyable) { diff --git a/test/ValuesForTestingTest.cpp b/test/ValuesForTestingTest.cpp index e9c60479ab..13f29be131 100644 --- a/test/ValuesForTestingTest.cpp +++ b/test/ValuesForTestingTest.cpp @@ -27,8 +27,8 @@ TEST(ValuesForTesting, valuesForTesting) { ASSERT_EQ(v.getMultiplicity(1), 84.0); ASSERT_THAT(v.asString(), - ::testing::StartsWith("Values for testing with 2 columns and " - "contents VocabIndex:3 VocabIndex:12")); + ::testing::StartsWith( + "Values for testing with 2 columns and contents V:3 V:12")); ASSERT_EQ(v.getDescriptor(), "explicit values for testing"); ASSERT_TRUE(v.resultSortedOn().empty()); ASSERT_TRUE(v.getChildren().empty()); From 10c74f04de9331f1cfbc781deaf53a215018ed0e Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sat, 1 Apr 2023 16:58:00 +0200 Subject: [PATCH 10/20] Method `mergeTriples` with good unit tests There is now a method `LocatedTriplesPerBlock::mergeTriples` that merges the delta triple into a given (possibly partial) block. 
There are unit tests for the following cases: full block with unrestricted `id1` and `id2`, full block with restricted `id1`, partial block with restricted `id1`, partial block with restricted `id1` and `id2`, and the latter with only a single column. Removed the previous complicated version that tried to do it in place. --- src/index/LocatedTriples.cpp | 210 ++++++++++++++++------------------- src/index/LocatedTriples.h | 54 +++++---- test/LocatedTriplesTest.cpp | 130 ++++++++++++++++------ 3 files changed, 218 insertions(+), 176 deletions(-) diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index cca49be166..0046b587f9 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -12,8 +12,8 @@ // ____________________________________________________________________________ template -size_t LocatedTriplesPerBlock::numTriplesInBlockImpl(size_t blockIndex, Id id1, - Id id2) const { +size_t LocatedTriplesPerBlock::numTriplesImpl(size_t blockIndex, Id id1, + Id id2) const { size_t count = 0; for (const LocatedTriple& locatedTriple : map_.at(blockIndex)) { if constexpr (matchMode == MatchMode::MatchAll) { @@ -28,137 +28,117 @@ size_t LocatedTriplesPerBlock::numTriplesInBlockImpl(size_t blockIndex, Id id1, } // ____________________________________________________________________________ -size_t LocatedTriplesPerBlock::numTriplesInBlock(size_t blockIndex) const { - return numTriplesInBlockImpl(blockIndex); +size_t LocatedTriplesPerBlock::numTriples(size_t blockIndex) const { + return numTriplesImpl(blockIndex); } // ____________________________________________________________________________ -size_t LocatedTriplesPerBlock::numTriplesInBlock(size_t blockIndex, - Id id1) const { - return numTriplesInBlockImpl(blockIndex, id1); +size_t LocatedTriplesPerBlock::numTriples(size_t blockIndex, Id id1) const { + return numTriplesImpl(blockIndex, id1); } // ____________________________________________________________________________ -size_t
LocatedTriplesPerBlock::numTriplesInBlock(size_t blockIndex, Id id1, - Id id2) const { - return numTriplesInBlockImpl(blockIndex, id1, id2); +size_t LocatedTriplesPerBlock::numTriples(size_t blockIndex, Id id1, + Id id2) const { + return numTriplesImpl(blockIndex, id1, id2); } // ____________________________________________________________________________ -void LocatedTriplesPerBlock::mergeTriplesIntoBlock(size_t blockIndex, - IdTable& idTable, size_t pos, - size_t len, int netGrowth) { - // If there are no triples in the given block, there is nothing to do. - if (!map_.contains(blockIndex)) { - AD_CONTRACT_CHECK(netGrowth == 0); - return; - } - const size_t numColumns = idTable.numColumns(); +template +void LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + const IdTable& block, IdTable& result, + size_t offsetInResult, + size_t offsetOfBlock, Id id1, + Id id2) const { + // This method should only be called, if located triples in that block exist. + AD_CONTRACT_CHECK(map_.contains(blockIndex)); - // Iterate backwards over the triples from the given block and process in - // groups of triples with the same `rowIndexInBlock`. - // - // NOTE: We have to keep track of the positions separately, since pointer - // arithmetic does not work on `LocatedTriples`, which is a `std::set`. That's - // also why we can't use `upper_bound` for searching. - const LocatedTriples& locatedTriples = map_.at(blockIndex); - auto locatedTriple = locatedTriples.rbegin(); - int shift = netGrowth; - size_t previousRowIndex = len; - while (locatedTriple != locatedTriples.rend()) { - // Search backwards for the next triple with a new `rowIndexInBlock`. Check - // that only the last triple in a group may already exist in the index (all - // triples with the same row index are <= the triple at that position in - // `idTable'). - size_t rowIndex = locatedTriple->rowIndexInBlock; - int groupNetGrowth = locatedTriple->existsInIndex ? 
-1 : 0; - auto nextLocatedTriple = locatedTriple; - ++nextLocatedTriple; - while (nextLocatedTriple != locatedTriples.rend() && - nextLocatedTriple->rowIndexInBlock == rowIndex) { - AD_CORRECTNESS_CHECK(nextLocatedTriple->existsInIndex == false); - ++groupNetGrowth; - ++nextLocatedTriple; + // TODO: For now, only implemented for two columns (easy to extend). + AD_CORRECTNESS_CHECK(block.numColumns() == 2); + + AD_CONTRACT_CHECK(block.numColumns() == result.numColumns()); + auto resultEntry = result.begin() + offsetInResult; + const auto& locatedTriples = map_.at(blockIndex); + auto locatedTriple = locatedTriples.begin(); + + // Helper lambda that checks whether the given located triple should be + // considered, given the `matchMode`. + auto locatedTripleMatches = [&]() { + if constexpr (matchMode == MatchMode::MatchAll) { + return true; + } else if constexpr (matchMode == MatchMode::MatchId1) { + return locatedTriple->id1 == id1; + } else if constexpr (matchMode == MatchMode::MatchId1AndId2) { + return locatedTriple->id1 == id1 && locatedTriple->id2 == id2; } - std::transform(locatedTriple, nextLocatedTriple, - std::ostream_iterator(std::cout, " "), - [](const LocatedTriple& lt) { - return absl::StrCat(lt.rowIndexInBlock, "[", - lt.existsInIndex, "]"); - }); - std::cout << "... shift = " << shift << ", net growth = " << groupNetGrowth - << std::endl; + }; + + // Skip located triples that come before `offsetOfBlock` because this may be a + // partial block. + while (locatedTriple != locatedTriples.end() && + locatedTriple->rowIndexInBlock < offsetOfBlock) { + ++locatedTriple; + } - // If the last triple in the group exists in the index, don't copy that - // (recall that we are iterating in reverse order). By the way how `shift` - // is updated (see the `shift -= groupNetGrowth` below), the matching - // entry in `idTable` will be be overwritten. - if (locatedTriple->existsInIndex) { + // Iterate over the input block. 
Keep track of the row index, which is + // `offsetInBlock` for the first element of the block. + size_t rowIndex = offsetOfBlock; + for (const auto& blockEntry : block) { + // Append triples that are marked for insertion at this position to the + // result. + while (locatedTriple != locatedTriples.end() && + locatedTriple->rowIndexInBlock == rowIndex && + locatedTriple->existsInIndex == false) { + if (locatedTripleMatches()) { + (*resultEntry)[0] = locatedTriple->id2; + (*resultEntry)[1] = locatedTriple->id3; + ++resultEntry; + } ++locatedTriple; } - // Make space in `idTable` at the right place for the new group of triples - // and insert the new `Id`s (and overwrite those of the deleted triples). - // - // NOTE: If the `idTable` has two columns, we write `id2` and `id3`. If it - // has only one column, we only write `id3`. - AD_CONTRACT_CHECK(numColumns == 1 || numColumns == 2); - for (size_t colIndex = 0; colIndex < numColumns; ++colIndex) { - std::span column = idTable.getColumn(colIndex); - // Shifting left or right requires two different functions. - if (shift >= 0) { - std::shift_right(column.begin() + pos + rowIndex, - column.begin() + pos + previousRowIndex + shift, - shift); - } else { - std::shift_left(column.begin() + pos + rowIndex, - column.begin() + pos + previousRowIndex - shift, - -shift); - } - // Show the changed column after the shift. - if (1) { - std::cout << "Col #" << colIndex << ": "; - std::copy(column.begin(), column.end(), - std::ostream_iterator(std::cout, " ")); - std::cout << " [after shift]" << std::endl; - } - // Add the new triples. - std::transform( - locatedTriple, nextLocatedTriple, - column.rbegin() + column.size() - pos - previousRowIndex - shift, - [&colIndex, &numColumns](const LocatedTriple& lt) { - return colIndex + 1 < numColumns ? lt.id2 : lt.id3; - }); - // For debugging only: null the gaps. 
- if (0) { - if (shift > 0 && shift > groupNetGrowth) { - for (int i = 0; i < shift - groupNetGrowth; ++i) { - column[pos + rowIndex + i] = Id::makeUndefined(); - } - } - } - // Show the changed column. - if (0) { - std::cout << "Col #" << colIndex << ": "; - std::copy(column.begin(), column.end(), - std::ostream_iterator(std::cout, " ")); - std::cout << " [after add]" << std::endl; - } + // Append the triple at this position to the result if and only if it is + // marked for deletion and matches (also skip it if it doesn't match). + bool deleteBlockEntry = false; + if (locatedTriple != locatedTriples.end() && + locatedTriple->rowIndexInBlock == rowIndex && + locatedTriple->existsInIndex == true) { + deleteBlockEntry = locatedTripleMatches(); + ++locatedTriple; + } + if (!deleteBlockEntry) { + *resultEntry++ = blockEntry; } - // Update for the next iteration. - previousRowIndex = rowIndex; - shift -= groupNetGrowth; - locatedTriple = nextLocatedTriple; - } - // AD_CORRECTNESS_CHECK(shift == 0); - // AD_CORRECTNESS_CHECK(std::is_sorted( - // idTable.begin() + pos, idTable.begin() + pos + len + numNewTriples)); + // Update `rowIndex` for the next `blockEntry`. + ++rowIndex; + }; +} - // Do something to `idTable` to make the compiler happy. 
- // std::shift_right(idTable.begin() + pos, - // idTable.begin() + pos + len + numNewTriples, - // numNewTriples); +// ____________________________________________________________________________ +void LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + const IdTable& block, IdTable& result, + size_t offsetInResult) const { + mergeTriples(blockIndex, block, result, offsetInResult); +} + +// ____________________________________________________________________________ +void LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + const IdTable& block, IdTable& result, + size_t offsetInResult, + size_t rowIndexOffset, Id id1) const { + mergeTriples(blockIndex, block, result, offsetInResult, + rowIndexOffset, id1); +} + +// ____________________________________________________________________________ +void LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + const IdTable& block, IdTable& result, + size_t offsetInResult, + size_t rowIndexOffset, Id id1, + Id id2) const { + mergeTriples( + blockIndex, block, result, offsetInResult, rowIndexOffset, id1, id2); } // ____________________________________________________________________________ diff --git a/src/index/LocatedTriples.h b/src/index/LocatedTriples.h index 0dd97fe91e..8e5dab090d 100644 --- a/src/index/LocatedTriples.h +++ b/src/index/LocatedTriples.h @@ -60,23 +60,30 @@ class LocatedTriplesPerBlock { public: // Get the number of located triples for the given block and that match the // `id1` (if provided) and `id2` (if provided). - size_t numTriplesInBlock(size_t blockIndex) const; - size_t numTriplesInBlock(size_t blockIndex, Id id1) const; - size_t numTriplesInBlock(size_t blockIndex, Id id1, Id id2) const; - - // Merge the located triples for the given block into the given `IdTable` - // segment. It is the resposibility of the caller that there is space for - // `numNewTriples` triples starting from `end`. 
Like for `numTriplesInBlock` - // above, consider only triples that match `id1` (if provided) and `id2` (if - // provided). - void mergeTriplesIntoBlock(size_t blockIndex, IdTable& idTable, size_t pos, - size_t len, int netGrowth); - void mergeTriplesIntoBlock(size_t blockIndex, size_t rowIndexOffset, Id id1, - IdTable& idTable, size_t pos, size_t end, - int netGrowth); - void mergeTriplesIntoBlock(size_t blockIndex, size_t rowIndexOffset, Id id1, - Id id2, IdTable& idTable, size_t pos, size_t end, - int netGrowth); + size_t numTriples(size_t blockIndex) const; + size_t numTriples(size_t blockIndex, Id id1) const; + size_t numTriples(size_t blockIndex, Id id1, Id id2) const; + + // Merge the located triples for `blockIndex` into the given `block` (which + // might be the whole block with that index or just a part of it) and write + // the result to `result`, starting from position `offsetInResult`. + // + // It is the resposibility of the caller that there is enough space or the + // result starting from that offset. Like for `numTriplesInBlock` above, + // consider only triples that match `id1` (if provided) and `id2` (if + // provided). If `block` is just a part of an index block, the first triple of + // block has row index `rowIndexOffset` in the original block. + // + // TODO: Beware of triples inserted at the end of the block, they are found in + // the `LocatedTriples` for `blockIndex + 1`. It's up to `CompressedRelation` + // to handle that correctly. + void mergeTriples(size_t blockIndex, const IdTable& block, IdTable& result, + size_t offsetInResult) const; + void mergeTriples(size_t blockIndex, const IdTable& block, IdTable& result, + size_t offsetInResult, size_t rowIndexOffset, Id id1) const; + void mergeTriples(size_t blockIndex, const IdTable& block, IdTable& result, + size_t offsetInResult, size_t rowIndexOffset, Id id1, + Id id2) const; // Add the given `locatedTriple` to the given `LocatedTriplesPerBlock`. 
// Returns a handle to where it was added (via which we can easily remove it @@ -108,16 +115,15 @@ class LocatedTriplesPerBlock { // The Implementation behind the public method `numTriplesInBlock` above. template - size_t numTriplesInBlockImpl(size_t blockIndex, Id id1 = Id::makeUndefined(), - Id id2 = Id::makeUndefined()) const; + size_t numTriplesImpl(size_t blockIndex, Id id1 = Id::makeUndefined(), + Id id2 = Id::makeUndefined()) const; // The Implementation behind the public method `mergeTriplesIntoBlock` above. // The only reason that the arguments `id1` and `id2` come at the end here is // so that we can give them default values. template - void mergeTriplesIntoBlock(size_t blockIndex, IdTable::iterator begin, - IdTable::iterator end, size_t numNewTriples, - size_t rowIndexOffset = 0, - Id id1 = Id::makeUndefined(), - Id id2 = Id::makeUndefined()) const; + void mergeTriples(size_t blockIndex, const IdTable& block, IdTable& result, + size_t offsetInResult, size_t rowIndexOffset = 0, + Id id1 = Id::makeUndefined(), + Id id2 = Id::makeUndefined()) const; }; diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp index 2647269476..00d952e990 100644 --- a/test/LocatedTriplesTest.cpp +++ b/test/LocatedTriplesTest.cpp @@ -43,54 +43,110 @@ TEST_F(LocatedTriplesTest, numTriplesInBlock) { ASSERT_EQ(locatedTriplesPerBlock.numTriples(), 7); // Check the total counts per block. - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(1), 3); - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(2), 2); - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(3), 2); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1), 3); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2), 2); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3), 2); // Check the counts per block for a given `id1`. 
- ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(1, V(10)), 2); - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(1, V(11)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(2, V(20)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(2, V(21)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(3, V(30)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(3, V(32)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10)), 2); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(11)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(20)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(21)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(30)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(32)), 1); // Check the counts per block for a given `id1` and `id2`. - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(1, V(10), V(1)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(1, V(10), V(2)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(1, V(11), V(3)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(2, V(20), V(4)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(2, V(21), V(5)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(3, V(30), V(6)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriplesInBlock(3, V(32), V(7)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10), V(1)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10), V(2)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(11), V(3)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(20), V(4)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(21), V(5)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(30), V(6)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(32), V(7)), 1); } // Test the method that merges the matching `LocatedTriple`s from a block into a // part of an `IdTable`. 
-TEST_F(LocatedTriplesTest, mergeTriplesIntoBlock) { - // An `IdTable` with two columns that could be the result of reading a single - // block in an index scan. - IdTable idTable = makeIdTableFromVector({{10, 10}, // Row 0 - {15, 20}, // Row 1 - {15, 30}, // Row 2 - {20, 10}, // Row 3 - {30, 20}, // Row 4 - {30, 30}}); // Row 5 +TEST_F(LocatedTriplesTest, mergeTriples) { + // A block, as it could come from an index scan. + IdTable block = makeIdTableFromVector({{10, 10}, // Row 0 + {15, 20}, // Row 1 + {15, 30}, // Row 2 + {20, 10}, // Row 3 + {30, 20}, // Row 4 + {30, 30}}); // Row 5 - // Set up a list of located triples for a single block, at various positions - // in the block. + // A set of located triples for that block. auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( {LocatedTriple{1, 0, V(1), V(10), V(10), true}, // Delete row 0 LocatedTriple{1, 1, V(1), V(10), V(11), false}, // Insert before row 1 - LocatedTriple{1, 1, V(1), V(11), V(10), false}, // Insert before row 1 - LocatedTriple{1, 4, V(1), V(21), V(11), false}, // Insert before row 4 - LocatedTriple{1, 4, V(1), V(30), V(10), false}, // Insert before row 4 - LocatedTriple{1, 4, V(1), V(30), V(20), true}, // Delete row 4 - LocatedTriple{1, 5, V(1), V(30), V(30), true}}); // Delete row 5 + LocatedTriple{1, 1, V(2), V(11), V(10), false}, // Insert before row 1 + LocatedTriple{1, 4, V(2), V(21), V(11), false}, // Insert before row 4 + LocatedTriple{1, 4, V(2), V(30), V(10), false}, // Insert before row 4 + LocatedTriple{1, 4, V(2), V(30), V(20), true}, // Delete row 4 + LocatedTriple{1, 5, V(3), V(30), V(30), true}}); // Delete row 5 - // Merge all these triples into the whole table. Since four triples are added - // and three triples are deleted, the net growth is one triple. 
- size_t netGrowth = 1; - idTable.resize(idTable.size() + netGrowth); - locatedTriplesPerBlock.mergeTriplesIntoBlock(1, idTable, 0, 6, netGrowth); + // Merge all these triples into `block` and check that the result is as + // expected (four triples inserted and three triples deleted). + { + IdTable resultExpected = makeIdTableFromVector({{10, 11}, // Row 0 + {11, 10}, // Row 1 + {15, 20}, // Row 2 + {15, 30}, // Row 3 + {20, 10}, // Row 4 + {21, 11}, // Row 5 + {30, 10}}); // Row 6 + IdTable result(block.numColumns(), ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, block, result, 0); + ASSERT_EQ(result, resultExpected); + } + + // Merge only the triples with `id1 == V(2)` into `block` (three triples + // inserted and one triple deleted). + { + IdTable resultExpected = makeIdTableFromVector({{10, 10}, // Row 0 + {11, 10}, // Row 1 + {15, 20}, // Row 2 + {15, 30}, // Row 3 + {20, 10}, // Row 4 + {21, 11}, // Row 5 + {30, 10}, // Row 6 + {30, 30}}); // Row 7 + IdTable result(block.numColumns(), ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, block, result, 0, 0, V(2)); + ASSERT_EQ(result, resultExpected); + } + + // Repeat but with a partial block that leaves out the first two elements of + // `block` (and correspondingly `offsetOfBlock == 2` in `mergeTriples`). 
+ IdTable blockTruncated = block.clone(); + std::shift_left(blockTruncated.begin(), blockTruncated.end(), 2); + blockTruncated.resize(block.size() - 2); + { + IdTable resultExpected = makeIdTableFromVector({{15, 30}, // Row 0 + {20, 10}, // Row 1 + {21, 11}, // Row 2 + {30, 10}, // Row 3 + {30, 30}}); // Row 4 + IdTable result(block.numColumns(), ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, blockTruncated, result, 0, 2, V(2)); + ASSERT_EQ(result, resultExpected); + } + + // Merge only the triples with `id1 == V(2)` and `id2 == V(30)` into the same + // truncated block as above (one triples inserted, one triple deleted). + { + IdTable resultExpected = makeIdTableFromVector({{15, 30}, // Row 0 + {20, 10}, // Row 1 + {30, 10}, // Row 2 + {30, 30}}); // Row 3 + IdTable result(block.numColumns(), ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, blockTruncated, result, 0, 2, V(2), + V(30)); + ASSERT_EQ(result, resultExpected); + } } From fa49a33eab2cd11429bffc0c4c61cc47aff45485 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sun, 2 Apr 2023 16:46:22 +0200 Subject: [PATCH 11/20] Consider delta triples for index scans with two variables Implemented it and wrote some unit tests. TODO: If the first block of the scan is incomplete, delta triples are currently ignored (with a warning). There is no problem in principle with adding support for this, but it first needs some refactoring of the original code to avoid code duplication.
--- src/index/CompressedRelation.cpp | 213 +++++++++++++++++++++++-------- src/index/CompressedRelation.h | 5 +- src/index/IndexMetaData.h | 14 +- src/index/LocatedTriples.cpp | 59 ++++++--- src/index/LocatedTriples.h | 16 ++- src/index/Permutations.h | 32 +++-- src/util/AllocatorWithLimit.h | 4 +- test/CompressedRelationsTest.cpp | 8 +- test/LocatedTriplesTest.cpp | 192 +++++++++++++++++++++++----- test/util/AllocatorTestHelpers.h | 10 +- 10 files changed, 415 insertions(+), 138 deletions(-) diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp index cd2933ab88..596f3326ae 100644 --- a/src/index/CompressedRelation.cpp +++ b/src/index/CompressedRelation.cpp @@ -5,6 +5,7 @@ #include "CompressedRelation.h" #include "engine/idTable/IdTable.h" +#include "util/AllocatorWithLimit.h" #include "util/Cache.h" #include "util/CompressionUsingZstd/ZstdWrapper.h" #include "util/ConcurrentCache.h" @@ -15,8 +16,8 @@ using namespace std::chrono_literals; // ____________________________________________________________________________ void CompressedRelationReader::scan( - const CompressedRelationMetadata& metadata, - const vector& blockMetadata, + const CompressedRelationMetadata& metadataForRelation, + const vector& metadataForAllBlocks, ad_utility::File& file, IdTable* result, ad_utility::SharedConcurrentTimeoutTimer timer, const LocatedTriplesPerBlock& locatedTriplesPerBlock) const { @@ -27,44 +28,22 @@ void CompressedRelationReader::scan( Id _col0FirstId; Id _col0LastId; }; - Id col0Id = metadata._col0Id; + Id col0Id = metadataForRelation._col0Id; // TODO Use a structured binding. Structured bindings are // currently not supported by clang when using OpenMP because clang internally // transforms the `#pragma`s into lambdas, and capturing structured bindings // is only supported in clang >= 16. 
- decltype(blockMetadata.begin()) beginBlock, endBlock; + decltype(metadataForAllBlocks.begin()) beginBlock, endBlock; std::tie(beginBlock, endBlock) = std::equal_range( // TODO For some reason we can't use `std::ranges::equal_range`, // find out why. Note: possibly it has something to do with the limited // support of ranges in clang with versions < 16. Revisit this when // we use clang 16. - blockMetadata.begin(), blockMetadata.end(), KeyLhs{col0Id, col0Id}, - [](const auto& a, const auto& b) { + metadataForAllBlocks.begin(), metadataForAllBlocks.end(), + KeyLhs{col0Id, col0Id}, [](const auto& a, const auto& b) { return a._col0FirstId < b._col0FirstId && a._col0LastId < b._col0LastId; }); - // PRELIMINARY: Say how many delta triples are contained in those blocks. - size_t numDeltaTriples = 0; - for (auto block = beginBlock; block < endBlock; ++block) { - size_t blockIndex = block - blockMetadata.begin(); - if (locatedTriplesPerBlock.map_.contains(blockIndex)) { - numDeltaTriples += locatedTriplesPerBlock.map_.at(blockIndex).size(); - } - } - LOG(INFO) << "Number of delta triples in blocks scanned: " << numDeltaTriples - << std::endl; - - // The total size of the result is now known. - result->resize(metadata.getNofElements()); - - // The position in the result to which the next block is being - // decompressed. - size_t rowIndexOfNextBlock = 0; - - // The number of rows for which we still have space - // in the result (only needed for checking of invariants). - size_t spaceLeft = result->size(); - // The first block might contain entries that are not part of our // actual scan result. 
bool firstBlockIsIncomplete = @@ -81,10 +60,53 @@ void CompressedRelationReader::scan( AD_CORRECTNESS_CHECK(!firstBlockIsIncomplete || (beginBlock == lastBlock)); AD_CORRECTNESS_CHECK(!lastBlockIsIncomplete); if (firstBlockIsIncomplete) { - AD_CORRECTNESS_CHECK(metadata._offsetInBlock != + AD_CORRECTNESS_CHECK(metadataForRelation._offsetInBlock != std::numeric_limits::max()); } + // Compute the numer of inserted and deleted triples per block and overall. + // note the `<=` so that we don't forget the block beyond the last (which may + // have information about delta triples at the vey end of a relation). + std::vector> numInsAndDelPerBlock; + size_t numInsTotal = 0; + size_t numDelTotal = 0; + for (auto block = beginBlock; block <= endBlock; ++block) { + size_t blockIndex = block - metadataForAllBlocks.begin(); + auto [numIns, numDel] = + block == beginBlock || block == endBlock + ? locatedTriplesPerBlock.numTriples(blockIndex, col0Id) + : locatedTriplesPerBlock.numTriples(blockIndex); + numInsTotal += numIns; + numDelTotal += numDel; + numInsAndDelPerBlock.push_back({numIns, numDel}); + } + if (numInsTotal > 0 || numDelTotal > 0) { + LOG(INFO) << "Index scan with delta triples: #inserts = " << numInsTotal + << ", #deletes = " << numDelTotal + << ", #blocks = " << (endBlock - beginBlock) << std::endl; + AD_CORRECTNESS_CHECK(numDelTotal < metadataForRelation.getNofElements()); + } + + // TODO: For now only consider delta triples in complete blocks. + if (firstBlockIsIncomplete) { + AD_CORRECTNESS_CHECK(numInsAndDelPerBlock.size() == 1); + numInsTotal = 0; + numDelTotal = 0; + LOG(WARN) << "Delta triples in incomplete block ignored!" << std::endl; + } + + // The total size of the result is now known. + result->resize(metadataForRelation.getNofElements() + numInsTotal - + numDelTotal); + + // The position in the result to which the next block is being + // decompressed. 
+ size_t rowIndexOfNextBlock = 0; + + // The number of rows for which we still have space + // in the result (only needed for checking of invariants). + size_t spaceLeft = result->size(); + // We have at most one block that is incomplete and thus requires trimming. // Set up a lambda, that reads this block and decompresses it to // the result. @@ -100,12 +122,12 @@ void CompressedRelationReader::scan( ._resultPointer; // Extract the part of the block that actually belongs to the relation - auto numElements = metadata._numRows; + auto numElements = metadataForRelation._numRows; AD_CORRECTNESS_CHECK(uncompressedBuffer->numColumns() == - metadata.numColumns()); + metadataForRelation.numColumns()); for (size_t i = 0; i < uncompressedBuffer->numColumns(); ++i) { const auto& inputCol = uncompressedBuffer->getColumn(i); - auto begin = inputCol.begin() + metadata._offsetInBlock; + auto begin = inputCol.begin() + metadataForRelation._offsetInBlock; auto resultColumn = result->getColumn(i); AD_CORRECTNESS_CHECK(numElements <= spaceLeft); std::copy(begin, begin + numElements, resultColumn.begin()); @@ -123,30 +145,38 @@ void CompressedRelationReader::scan( } } - // Read all the other (complete!) blocks in parallel + // Process all the other (complete) blocks. The compressed blocks are read + // sequentially from disk and then decompressed in parallel. + const size_t blockIndexBegin = beginBlock - metadataForAllBlocks.begin(); + size_t blockIndex = blockIndexBegin; if (beginBlock < endBlock) { #pragma omp parallel #pragma omp single { for (; beginBlock < endBlock; ++beginBlock) { const auto& block = *beginBlock; - // Read a block from disk (serially). + std::pair numInsAndDel = + numInsAndDelPerBlock.at(blockIndex - blockIndexBegin); + // Read the compressed block from disk (sequentially). CompressedBlock compressedBuffer = readCompressedBlockFromFile(block, file, std::nullopt); // This lambda decompresses the block that was just read to the // correct position in the result. 
auto decompressLambda = [&result, rowIndexOfNextBlock, &block, + &numInsAndDel, &locatedTriplesPerBlock, + &blockIndex, compressedBuffer = std::move(compressedBuffer)]() { ad_utility::TimeBlockAndLog tbl{"Decompressing a block"}; - decompressBlockToExistingIdTable(compressedBuffer, block._numRows, - *result, rowIndexOfNextBlock); + decompressBlockToExistingIdTable( + compressedBuffer, block._numRows, *result, rowIndexOfNextBlock, + numInsAndDel, locatedTriplesPerBlock, blockIndex); }; - // The `decompressLambda` can now run in parallel + // This `decompressLambda` can run concurrently. #pragma omp task { if (!timer || !timer->wlock()->hasTimedOut()) { @@ -154,14 +184,55 @@ void CompressedRelationReader::scan( }; } - // this is again serial code, set up the correct pointers - // for the next block; - spaceLeft -= block._numRows; - rowIndexOfNextBlock += block._numRows; + // This is again serial code, which sets up the correct pointers for the + // next block. + AD_CORRECTNESS_CHECK(numInsAndDel.second <= block._numRows); + size_t numRowsOfThisBlock = + block._numRows + numInsAndDel.first - numInsAndDel.second; + AD_CORRECTNESS_CHECK(numRowsOfThisBlock <= spaceLeft); + spaceLeft -= numRowsOfThisBlock; + rowIndexOfNextBlock += numRowsOfThisBlock; + ++blockIndex; + } + } + // End of omp parallel region, all blocks are decompressed now. + } + + // Check whether there are relevant delta triples in the next block. If yes, + // these must all come contiguously at the very beginning of that block, have + // `rowIndexInBlock == std::numeric_limits::max()` and `id1 == col0Id` + // and must all be inserts. + // + // TODO: This should be a separate function (of `LocatedTriplesPerBlock`?). 
+ AD_CORRECTNESS_CHECK(numInsAndDelPerBlock.size() >= 1); + size_t numIns = numInsAndDelPerBlock.back().first; + if (numIns > 0) { + // LOG(INFO) << "Triples to be inserted after last block" << std::endl; + // LOG(INFO) << "numInsAndDel.first: " << numIns << std::endl; + AD_CORRECTNESS_CHECK(result->numRows() >= rowIndexOfNextBlock + numIns); + AD_CORRECTNESS_CHECK(locatedTriplesPerBlock.map_.contains(blockIndex)); + size_t rowIndex = rowIndexOfNextBlock; + const LocatedTriples& locatedTriples = + locatedTriplesPerBlock.map_.at(blockIndex); + AD_CORRECTNESS_CHECK(locatedTriples.size() >= numIns); + for (const auto& locatedTriple : locatedTriples) { + // LOG(INFO) << "Located triple: " << locatedTriple.id1 << " " + // << locatedTriple.id2 << " " << locatedTriple.id3 + // << " rowIndexInBlock = " << locatedTriple.rowIndexInBlock + // << std::endl; + if (locatedTriple.id1 == col0Id) { + AD_CORRECTNESS_CHECK(locatedTriple.rowIndexInBlock == + std::numeric_limits::max()); + (*result)(rowIndex, 0) = locatedTriple.id2; + (*result)(rowIndex, 1) = locatedTriple.id3; + ++rowIndex; + --spaceLeft; + } else { + break; } - AD_CORRECTNESS_CHECK(spaceLeft == 0); - } // End of omp parallel region, all the decompression was handled now. 
+ } } + AD_CORRECTNESS_CHECK(spaceLeft == 0); } // _____________________________________________________________________________ @@ -497,14 +568,56 @@ DecompressedBlock CompressedRelationReader::decompressBlock( // ____________________________________________________________________________ void CompressedRelationReader::decompressBlockToExistingIdTable( const CompressedBlock& compressedBlock, size_t numRowsToRead, - IdTable& table, size_t offsetInTable) { - AD_CORRECTNESS_CHECK(table.numRows() >= offsetInTable + numRowsToRead); + IdTable& result, size_t offsetInResult, + std::pair numInsAndDel, + const LocatedTriplesPerBlock& locatedTriplesPerBlock, size_t blockIndex) { + // Check that the given arguments are consistent (they should always be, given + // that this method is `private`). + // LOG(INFO) << "numRowsToRead: " << numRowsToRead << std::endl; + // LOG(INFO) << "numInsAndDel.first: " << numInsAndDel.first << std::endl; + // LOG(INFO) << "numInsAndDel.second: " << numInsAndDel.second << std::endl; + AD_CORRECTNESS_CHECK(numInsAndDel.second <= numRowsToRead); + AD_CORRECTNESS_CHECK(result.numRows() + numInsAndDel.second >= + offsetInResult + numRowsToRead + numInsAndDel.first); + AD_CORRECTNESS_CHECK(compressedBlock.size() == result.numColumns()); + + // Helper lambda that decompresses `numRowsToRead` from `compressedBlock` + // to the given `IdTable` iterator. + // + // TODO: It would be more natural to pass an `IdTable::iterator` here, but it + // seems that we can't get from that an iterator into an `IdTable` column. + // // TODO use zip_view. 
- AD_CORRECTNESS_CHECK(compressedBlock.size() == table.numColumns()); - for (size_t i = 0; i < compressedBlock.size(); ++i) { - auto col = table.getColumn(i); - decompressColumn(compressedBlock[i], numRowsToRead, - col.data() + offsetInTable); + auto decompressToIdTable = [&compressedBlock, &numRowsToRead]( + IdTable& idTable, size_t offsetInIdTable) { + size_t numColumns = compressedBlock.size(); + for (size_t i = 0; i < numColumns; ++i) { + const auto& columnFromBlock = compressedBlock[i]; + auto columnFromIdTable = idTable.getColumn(i); + decompressColumn(columnFromBlock, numRowsToRead, + columnFromIdTable.data() + offsetInIdTable); + } + }; + + // If there are no delta triples for this block, just decompress directly to + // the `result` table. Otherwise decompress to an intermediate table and merge + // from there to `result`. + // + // TODO: In the second case, we use an unlimited allocator for the space + // allocation for the intermediate table. This looks OK because our blocks are + // small, but it might be better to allocate also this table from the memory + // pool available to the server (to which we don't have acces here). + if (numInsAndDel == std::pair{0, 0}) { + decompressToIdTable(result, offsetInResult); + } else { + ad_utility::AllocatorWithLimit allocator{ + ad_utility::makeAllocationMemoryLeftThreadsafeObject( + std::numeric_limits::max())}; + IdTable decompressedBlock(compressedBlock.size(), allocator); + decompressedBlock.resize(numRowsToRead); + decompressToIdTable(decompressedBlock, 0); + locatedTriplesPerBlock.mergeTriples(blockIndex, decompressedBlock, result, + offsetInResult); } } diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index b0bf320e04..a74c6a3e26 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -310,7 +310,10 @@ class CompressedRelationReader { // must have at least `numRowsToRead + offsetInTable` rows. 
static void decompressBlockToExistingIdTable( const CompressedBlock& compressedBlock, size_t numRowsToRead, - IdTable& table, size_t offsetInTable); + IdTable& table, size_t offsetInTable, + std::pair numInsAndDel = {0, 0}, + const LocatedTriplesPerBlock& locatedTriplesPerBlock = {}, + size_t blockIndex = 0); // Helper function used by `decompressBlock` and // `decompressBlockToExistingIdTable`. Decompress the `compressedColumn` and diff --git a/src/index/IndexMetaData.h b/src/index/IndexMetaData.h index 99fa2f7445..657b8600e5 100644 --- a/src/index/IndexMetaData.h +++ b/src/index/IndexMetaData.h @@ -13,14 +13,14 @@ #include #include -#include "../global/Id.h" -#include "../util/File.h" -#include "../util/HashMap.h" -#include "../util/MmapVector.h" -#include "../util/ReadableNumberFact.h" -#include "../util/Serializer/Serializer.h" -#include "./MetaDataHandler.h" #include "CompressedRelation.h" +#include "global/Id.h" +#include "index/MetaDataHandler.h" +#include "util/File.h" +#include "util/HashMap.h" +#include "util/MmapVector.h" +#include "util/ReadableNumberFact.h" +#include "util/Serializer/Serializer.h" using std::array; using std::pair; diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index 0046b587f9..af5d4c8834 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -12,34 +12,56 @@ // ____________________________________________________________________________ template -size_t LocatedTriplesPerBlock::numTriplesImpl(size_t blockIndex, Id id1, - Id id2) const { - size_t count = 0; +std::pair LocatedTriplesPerBlock::numTriplesImpl( + size_t blockIndex, Id id1, Id id2) const { + // If no located triples for `blockIndex` exist, there are no delta triples + // for that block. + if (!map_.contains(blockIndex)) { + return {0, 0}; + } + + // Otherwise iterate over all entries and count. 
+ size_t countInserted = 0; + size_t countDeleted = 0; for (const LocatedTriple& locatedTriple : map_.at(blockIndex)) { + // Helper lambda for increasing the right counter. + auto increaseCountIf = [&](bool increase) { + if (increase) { + if (locatedTriple.existsInIndex) { + ++countDeleted; + } else { + ++countInserted; + } + } + }; + // Increase depending on the mode. if constexpr (matchMode == MatchMode::MatchAll) { - ++count; + increaseCountIf(true); } else if constexpr (matchMode == MatchMode::MatchId1) { - count += (locatedTriple.id1 == id1); + increaseCountIf(locatedTriple.id1 == id1); } else if constexpr (matchMode == MatchMode::MatchId1AndId2) { - count += (locatedTriple.id1 == id1 && locatedTriple.id2 == id2); + increaseCountIf(locatedTriple.id1 == id1 && locatedTriple.id2 == id2); } } - return count; + return {countInserted, countDeleted}; } // ____________________________________________________________________________ -size_t LocatedTriplesPerBlock::numTriples(size_t blockIndex) const { +std::pair LocatedTriplesPerBlock::numTriples( + size_t blockIndex) const { return numTriplesImpl(blockIndex); } // ____________________________________________________________________________ -size_t LocatedTriplesPerBlock::numTriples(size_t blockIndex, Id id1) const { +std::pair LocatedTriplesPerBlock::numTriples(size_t blockIndex, + Id id1) const { return numTriplesImpl(blockIndex, id1); } // ____________________________________________________________________________ -size_t LocatedTriplesPerBlock::numTriples(size_t blockIndex, Id id1, - Id id2) const { +std::pair LocatedTriplesPerBlock::numTriples(size_t blockIndex, + Id id1, + Id id2) const { return numTriplesImpl(blockIndex, id1, id2); } @@ -50,11 +72,10 @@ void LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, size_t offsetInResult, size_t offsetOfBlock, Id id1, Id id2) const { - // This method should only be called, if located triples in that block exist. 
+ // This method should only be called, if located triples in that block exist + // and for blocks with one or two columns. AD_CONTRACT_CHECK(map_.contains(blockIndex)); - - // TODO: For now, only implemented for two columns (easy to extend). - AD_CORRECTNESS_CHECK(block.numColumns() == 2); + AD_CONTRACT_CHECK(block.numColumns() == 1 || block.numColumns() == 2); AD_CONTRACT_CHECK(block.numColumns() == result.numColumns()); auto resultEntry = result.begin() + offsetInResult; @@ -90,8 +111,12 @@ void LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, locatedTriple->rowIndexInBlock == rowIndex && locatedTriple->existsInIndex == false) { if (locatedTripleMatches()) { - (*resultEntry)[0] = locatedTriple->id2; - (*resultEntry)[1] = locatedTriple->id3; + if (result.numColumns() == 2) { + (*resultEntry)[0] = locatedTriple->id2; + (*resultEntry)[1] = locatedTriple->id3; + } else { + (*resultEntry)[0] = locatedTriple->id3; + } ++resultEntry; } ++locatedTriple; diff --git a/src/index/LocatedTriples.h b/src/index/LocatedTriples.h index 8e5dab090d..08c0b014aa 100644 --- a/src/index/LocatedTriples.h +++ b/src/index/LocatedTriples.h @@ -58,11 +58,12 @@ class LocatedTriplesPerBlock { ad_utility::HashMap map_; public: - // Get the number of located triples for the given block and that match the - // `id1` (if provided) and `id2` (if provided). - size_t numTriples(size_t blockIndex) const; - size_t numTriples(size_t blockIndex, Id id1) const; - size_t numTriples(size_t blockIndex, Id id1, Id id2) const; + // Get the number of to-be-inserted (first) and to-be-deleted (second) triples + // for the given block and that match the `id1` (if provided) and `id2` (if + // provided). 
+ std::pair numTriples(size_t blockIndex) const; + std::pair numTriples(size_t blockIndex, Id id1) const; + std::pair numTriples(size_t blockIndex, Id id1, Id id2) const; // Merge the located triples for `blockIndex` into the given `block` (which // might be the whole block with that index or just a part of it) and write @@ -115,8 +116,9 @@ class LocatedTriplesPerBlock { // The Implementation behind the public method `numTriplesInBlock` above. template - size_t numTriplesImpl(size_t blockIndex, Id id1 = Id::makeUndefined(), - Id id2 = Id::makeUndefined()) const; + std::pair numTriplesImpl(size_t blockIndex, + Id id1 = Id::makeUndefined(), + Id id2 = Id::makeUndefined()) const; // The Implementation behind the public method `mergeTriplesIntoBlock` above. // The only reason that the arguments `id1` and `id2` come at the end here is diff --git a/src/index/Permutations.h b/src/index/Permutations.h index 00e0118acc..2751eaf8bc 100644 --- a/src/index/Permutations.h +++ b/src/index/Permutations.h @@ -38,7 +38,9 @@ class PermutationImpl { _fileSuffix(std::move(suffix)), _keyOrder(order) {} - // everything that has to be done when reading an index from disk + // Initialize this permutation based on its index file(s) on disk. For PSO and + // PSO, this is one file named `.index.pos` or `.index.pso`, respectively. For + // the other permutations, there is also a `.meta` file. void loadFromDisk(const std::string& onDiskBase) { if constexpr (MetaData::_isMmapBased) { _meta.setup(onDiskBase + ".index" + _fileSuffix + MMAP_FILE_SUFFIX, @@ -58,9 +60,9 @@ class PermutationImpl { _isLoaded = true; } - /// For a given ID for the first column, retrieve all IDs of the second and - /// third column, and store them in `result`. This is just a thin wrapper - /// around `CompressedRelationMetaData::scan`. + // For a given relation `Id` (first column), retrieve all `Id`s of the second + // and third column, and store them in `result`. 
This is just a thin wrapper + // around the corresponding `CompressedRelationMetaData::scan`. template void scan(Id col0Id, IdTableImpl* result, ad_utility::SharedConcurrentTimeoutTimer timer = nullptr) const { @@ -71,23 +73,25 @@ class PermutationImpl { if (!_meta.col0IdExists(col0Id)) { return; } - const auto& metaData = _meta.getMetaData(col0Id); - return _reader.scan(metaData, _meta.blockData(), _file, result, - std::move(timer), locatedTriplesPerBlock_); + const auto& metadataForRelation = _meta.getMetaData(col0Id); + const auto& metadataForAllBlocks = _meta.blockData(); + return _reader.scan(metadataForRelation, metadataForAllBlocks, _file, + result, std::move(timer), locatedTriplesPerBlock_); } - /// For given IDs for the first and second column, retrieve all IDs of the - /// third column, and store them in `result`. This is just a thin wrapper - /// around `CompressedRelationMetaData::scan`. + + // For a given relation `Id` (first column) and `Id` for the second column, + // retrieve all `Id`s of the third column, and store them in `result`. Also + // just a wrapper around the corresponding `CompressedRelationMetaData::scan`. 
template void scan(Id col0Id, Id col1Id, IdTableImpl* result, ad_utility::SharedConcurrentTimeoutTimer timer = nullptr) const { if (!_meta.col0IdExists(col0Id)) { return; } - const auto& metaData = _meta.getMetaData(col0Id); - - return _reader.scan(metaData, col1Id, _meta.blockData(), _file, result, - timer, locatedTriplesPerBlock_); + const auto& metadataForRelation = _meta.getMetaData(col0Id); + const auto& metadataForAllBlocks = _meta.blockData(); + return _reader.scan(metadataForRelation, col1Id, metadataForAllBlocks, + _file, result, timer, locatedTriplesPerBlock_); } // _______________________________________________________ diff --git a/src/util/AllocatorWithLimit.h b/src/util/AllocatorWithLimit.h index d45f6499bd..16185faa2b 100644 --- a/src/util/AllocatorWithLimit.h +++ b/src/util/AllocatorWithLimit.h @@ -89,8 +89,8 @@ class AllocationMemoryLeftThreadsafe { }; } // namespace detail -// setup a shared Allocation state. For the usage see documentation of the -// Limited Allocator class +// Setup a shared Allocation state. For the usage see documentation of the +// `AllocatorWithLimit` class. inline detail::AllocationMemoryLeftThreadsafe makeAllocationMemoryLeftThreadsafeObject(size_t n) { return detail::AllocationMemoryLeftThreadsafe{std::make_shared< diff --git a/test/CompressedRelationsTest.cpp b/test/CompressedRelationsTest.cpp index d05b1d12c1..b7f5a5f0a1 100644 --- a/test/CompressedRelationsTest.cpp +++ b/test/CompressedRelationsTest.cpp @@ -1,6 +1,6 @@ -// Copyright 2023, University of Freiburg, -// Chair of Algorithms and Data Structures. -// Author: Johannes Kalmbach +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach #include @@ -103,8 +103,8 @@ void testCompressedRelations(const std::vector& inputs, ad_utility::File file{filename, "r"}; auto timer = std::make_shared( ad_utility::TimeoutTimer::unlimited()); - // Check the contents of the metadata. 
+ // Check the contents of the metadata. CompressedRelationReader reader; for (size_t i = 0; i < metaData.size(); ++i) { const auto& m = metaData[i]; diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp index 00d952e990..254600d6a3 100644 --- a/test/LocatedTriplesTest.cpp +++ b/test/LocatedTriplesTest.cpp @@ -6,7 +6,10 @@ #include "./util/IdTableHelpers.h" #include "./util/IdTestHelpers.h" +#include "index/CompressedRelation.h" +#include "index/IndexMetaData.h" #include "index/LocatedTriples.h" +#include "index/Permutations.h" // TODO: Why the namespace here? (copied from `test/IndexMetaDataTest.cpp`) namespace { @@ -34,35 +37,39 @@ TEST_F(LocatedTriplesTest, numTriplesInBlock) { auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( {LocatedTriple{1, 0, V(10), V(1), V(0), true}, LocatedTriple{1, 0, V(10), V(2), V(1), true}, - LocatedTriple{1, 0, V(11), V(3), V(0), true}, - LocatedTriple{2, 0, V(20), V(4), V(0), true}, - LocatedTriple{2, 0, V(21), V(5), V(0), true}, - LocatedTriple{3, 0, V(30), V(6), V(0), true}, + LocatedTriple{1, 0, V(11), V(3), V(0), false}, + LocatedTriple{2, 0, V(20), V(4), V(0), false}, + LocatedTriple{2, 0, V(21), V(5), V(0), false}, + LocatedTriple{3, 0, V(30), V(6), V(0), false}, LocatedTriple{3, 0, V(32), V(7), V(0), true}}); ASSERT_EQ(locatedTriplesPerBlock.numBlocks(), 3); ASSERT_EQ(locatedTriplesPerBlock.numTriples(), 7); + auto P = [](size_t n1, size_t n2) -> std::pair { + return {n1, n2}; + }; + // Check the total counts per block. - ASSERT_EQ(locatedTriplesPerBlock.numTriples(1), 3); - ASSERT_EQ(locatedTriplesPerBlock.numTriples(2), 2); - ASSERT_EQ(locatedTriplesPerBlock.numTriples(3), 2); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1), P(1, 2)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2), P(2, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3), P(1, 1)); // Check the counts per block for a given `id1`. 
- ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10)), 2); - ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(11)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(20)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(21)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(30)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(32)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10)), P(0, 2)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(11)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(20)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(21)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(30)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(32)), P(0, 1)); // Check the counts per block for a given `id1` and `id2`. - ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10), V(1)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10), V(2)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(11), V(3)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(20), V(4)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(21), V(5)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(30), V(6)), 1); - ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(32), V(7)), 1); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10), V(1)), P(0, 1)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(10), V(2)), P(0, 1)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(1, V(11), V(3)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(20), V(4)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(2, V(21), V(5)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(30), V(6)), P(1, 0)); + ASSERT_EQ(locatedTriplesPerBlock.numTriples(3, V(32), V(7)), P(0, 1)); } // Test the method that merges the matching `LocatedTriple`s from a block into a @@ -121,10 +128,10 @@ TEST_F(LocatedTriplesTest, mergeTriples) { // Repeat but with a partial block that 
leaves out the first two elements of // `block` (and correspondingly `offsetOfBlock == 2` in `mergeTriples`). - IdTable blockTruncated = block.clone(); - std::shift_left(blockTruncated.begin(), blockTruncated.end(), 2); - blockTruncated.resize(block.size() - 2); { + IdTable blockTruncated = block.clone(); + std::shift_left(blockTruncated.begin(), blockTruncated.end(), 2); + blockTruncated.resize(block.size() - 2); IdTable resultExpected = makeIdTableFromVector({{15, 30}, // Row 0 {20, 10}, // Row 1 {21, 11}, // Row 2 @@ -136,17 +143,140 @@ TEST_F(LocatedTriplesTest, mergeTriples) { ASSERT_EQ(result, resultExpected); } - // Merge only the triples with `id1 == V(2)` and `id2 == V(30)` into the same - // truncated block as above (one triples inserted, one triple deleted). + // Merge only the triples with `id1 == V(2)` and `id2 == V(30)` into the + // corresponding partial block (one triple inserted, one triple deleted). + // + // TODO: I don't think this case can actually occur in our code. When `id1` + // and `id2` are specified, we are only interesting in `id3` for result. { - IdTable resultExpected = makeIdTableFromVector({{15, 30}, // Row 0 - {20, 10}, // Row 1 - {30, 10}, // Row 2 - {30, 30}}); // Row 3 - IdTable result(block.numColumns(), ad_utility::testing::makeAllocator()); + IdTable blockTruncated = makeIdTableFromVector({{30, 20}, {30, 30}}); + IdTable resultExpected = makeIdTableFromVector({{30, 10}, {30, 30}}); + IdTable result(blockTruncated.numColumns(), + ad_utility::testing::makeAllocator()); result.resize(resultExpected.size()); - locatedTriplesPerBlock.mergeTriples(1, blockTruncated, result, 0, 2, V(2), + locatedTriplesPerBlock.mergeTriples(1, blockTruncated, result, 0, 4, V(2), V(30)); ASSERT_EQ(result, resultExpected); } + + // Same, but only with the last column. 
+ { + IdTable blockTruncated = makeIdTableFromVector({{20}, {30}}); + IdTable resultExpected = makeIdTableFromVector({{10}, {30}}); + IdTable result(blockTruncated.numColumns(), + ad_utility::testing::makeAllocator()); + result.resize(resultExpected.size()); + locatedTriplesPerBlock.mergeTriples(1, blockTruncated, result, 0, 4, V(2), + V(30)); + ASSERT_EQ(result, resultExpected); + } +} + +// Test `Permutation::scan` (and hence also `CompressedRelation::scan`) with +// triples merged from a `locatedTriplesPerBlock` object. +TEST_F(LocatedTriplesTest, scanWithMergeTriples) { + // TODO: Test with multiple block sizes. + size_t blockSizeInBytes = 32; + std::string basename = "LocatedTriplesTest.scanWithMergeTriples"; + std::string permutationFilename = basename + ".index.pso"; + + // Helper lambda for creating a `BufferedIdTable` (which we need for + // `CompressedRelationWriter` from an ordinary `IdTable` with two columns). + // + // TODO: Something like this is also used in `CompressedRelationsTest`, so it + // should be in a helper class. + auto getBufferedIdTable = [](const IdTable& idTable) -> BufferedIdTable { + // Note that these files are never created because we set the threshold for + // writing to disk so large. + std::string bufferFilename1 = "compressedRelationWriter.buffer1.dat"; + std::string bufferFilename2 = "compressedRelationWriter.buffer2.dat"; + AD_CONTRACT_CHECK(idTable.numColumns() == 2); + BufferedIdTable bufferedIdTable{ + 2, + std::array{ad_utility::BufferedVector{ + std::numeric_limits::max(), bufferFilename1}, + ad_utility::BufferedVector{ + std::numeric_limits::max(), bufferFilename2}}}; + for (size_t i = 0; i < idTable.size(); ++i) { + bufferedIdTable.push_back({idTable(i, 0), idTable(i, 1)}); + } + return bufferedIdTable; + }; + + // Our test relation. 
+ Id relationId = V(1); + IdTable relation = makeIdTableFromVector({{10, 10}, // Row 0 + {15, 20}, // Row 1 + {15, 30}, // Row 2 + {20, 10}, // Row 3 + {30, 20}, // Row 4 + {30, 30}}); // Row 5 + + // Write it to disk (adapted from `CompressedRelationsTest`). The last value + // of the call to `addRelation` is the number of distinct elements. + ad_utility::File permutationFileForWritingRelations{permutationFilename, "w"}; + CompressedRelationWriter writer{std::move(permutationFileForWritingRelations), + blockSizeInBytes}; + writer.addRelation(relationId, getBufferedIdTable(relation), relation.size()); + writer.finish(); + auto metadataPerRelation = writer.getFinishedMetaData(); + auto metadataPerBlock = writer.getFinishedBlocks(); + AD_CORRECTNESS_CHECK(metadataPerRelation.size() == 1); + + // Append the metadata to the index file. + IndexMetaDataHmap metadata; + std::ranges::for_each(metadataPerRelation, + [&metadata](auto& md) { metadata.add(md); }); + metadata.blockData() = metadataPerBlock; + ad_utility::File permutationFileForWritingMetadata{permutationFilename, "r+"}; + metadata.appendToFile(&permutationFileForWritingMetadata); + permutationFileForWritingMetadata.close(); + + // Create a permutation based on this. + LocatedTriplesPerBlock locatedTriplesPerBlock; + Permutation::PermutationImpl permutation{ + SortByPSO(), "PSO", ".pso", {1, 0, 2}, locatedTriplesPerBlock}; + permutation.loadFromDisk(basename); + // ad_utility::File permutationFileForReading{permutationFilename, "r"}; + // permutation._file = std::move(permutationFileForReading); + // permutation._meta = metadata; + // permutation._isLoaded = true; + + // Read the (for this test: first and only) relation from disk and check that + // it is the same. 
+ IdTable result(relation.numColumns(), ad_utility::testing::makeAllocator()); + permutation.scan(relationId, &result); + // CompressedRelationReader reader; + // reader.scan(metadataPerRelation[0], metadataPerBlock, permutation._file, + // &result, ad_utility::SharedConcurrentTimeoutTimer{}); + ASSERT_EQ(result, relation); + + // Helper lambda for adding to `locatedTriplesPerBlock`. + auto locatedTriplesPerBlockAdd = [&locatedTriplesPerBlock, &relationId, + &permutation](Id id2, Id id3) { + locatedTriplesPerBlock.add(LocatedTriple::locateTripleInPermutation( + relationId, id2, id3, permutation)); + }; + + // Again, but with some located triples merged (three inserts, four deletes). + locatedTriplesPerBlockAdd(V(15), V(20)); // Delete. + locatedTriplesPerBlockAdd(V(14), V(20)); // Insert. + locatedTriplesPerBlockAdd(V(20), V(10)); // Delete. + locatedTriplesPerBlockAdd(V(30), V(20)); // Delete. + locatedTriplesPerBlockAdd(V(30), V(30)); // Delete. + locatedTriplesPerBlockAdd(V(30), V(31)); // Insert at very end. + locatedTriplesPerBlockAdd(V(30), V(32)); // Insert at very end. + permutation.scan(relationId, &result); + // reader.scan(metadataPerRelation[0], metadataPerBlock, permutation._file, + // &result, ad_utility::SharedConcurrentTimeoutTimer{}, + // locatedTriplesPerBlock); + IdTable resultExpected = makeIdTableFromVector({{10, 10}, // Row 0 + {14, 20}, // Row 1 + {15, 30}, // Row 2 + {30, 31}, // Row 3 + {30, 32}}); // Row 4 + ASSERT_EQ(result, resultExpected); + + // Delete the file with the compressed relations. + ad_utility::deleteFile(permutationFilename); } diff --git a/test/util/AllocatorTestHelpers.h b/test/util/AllocatorTestHelpers.h index 0666c70488..c71b8687fe 100644 --- a/test/util/AllocatorTestHelpers.h +++ b/test/util/AllocatorTestHelpers.h @@ -1,6 +1,6 @@ -// Copyright 2023, University of Freiburg, -// Chair of Algorithms and Data Structures. 
-// Author: Johannes Kalmbach +// Copyright 2023, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach #pragma once @@ -10,9 +10,9 @@ namespace ad_utility::testing { // Create an unlimited allocator. inline ad_utility::AllocatorWithLimit& makeAllocator() { - static ad_utility::AllocatorWithLimit a{ + static ad_utility::AllocatorWithLimit allocator{ ad_utility::makeAllocationMemoryLeftThreadsafeObject( std::numeric_limits::max())}; - return a; + return allocator; } } // namespace ad_utility::testing From 15eaf482bd056089383e3ef9a5995c4e11f48318 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Mon, 3 Apr 2023 18:44:30 +0200 Subject: [PATCH 12/20] Delta triples merged for both scan types now The variant of `CompressedRelation::scan` with two `Id`s fixed now also considers delta triples. This was significantly more complicated than for the variant with only one `Id` fixed and required quite a bit of refactoring. TODO: For the first incomplete block when only a single `Id` is fixed, delta triples are still not considered. That should be easy to add though. --- src/index/CompressedRelation.cpp | 421 +++++++++++++++++++------------ src/index/LocatedTriples.cpp | 159 ++++++++---- src/index/LocatedTriples.h | 56 ++-- test/LocatedTriplesTest.cpp | 265 ++++++++++--------- 4 files changed, 540 insertions(+), 361 deletions(-) diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp index 596f3326ae..3de1184102 100644 --- a/src/index/CompressedRelation.cpp +++ b/src/index/CompressedRelation.cpp @@ -89,7 +89,7 @@ void CompressedRelationReader::scan( // TODO: For now only consider delta triples in complete blocks. if (firstBlockIsIncomplete) { - AD_CORRECTNESS_CHECK(numInsAndDelPerBlock.size() == 1); + AD_CORRECTNESS_CHECK(numInsAndDelPerBlock.size() == 2); numInsTotal = 0; numDelTotal = 0; LOG(WARN) << "Delta triples in incomplete block ignored!" 
<< std::endl; @@ -101,7 +101,7 @@ void CompressedRelationReader::scan( // The position in the result to which the next block is being // decompressed. - size_t rowIndexOfNextBlock = 0; + size_t offsetInResult = 0; // The number of rows for which we still have space // in the result (only needed for checking of invariants). @@ -112,6 +112,10 @@ void CompressedRelationReader::scan( // the result. auto readIncompleteBlock = [&](const auto& block) { // A block is uniquely identified by its start position in the file. + // + // NOTE: We read these blocks via a cache in order to speed up the unit + // tests (which make many requests to the same block, so we don't want to + // decompress it again and again). auto cacheKey = block._offsetsAndCompressedSize.at(0)._offsetInFile; auto uncompressedBuffer = blockCache_ .computeOnce(cacheKey, @@ -132,14 +136,16 @@ void CompressedRelationReader::scan( AD_CORRECTNESS_CHECK(numElements <= spaceLeft); std::copy(begin, begin + numElements, resultColumn.begin()); } - rowIndexOfNextBlock += numElements; + offsetInResult += numElements; spaceLeft -= numElements; }; // Read the first block if it is incomplete + auto completeBlocksBegin = beginBlock; + auto completeBlocksEnd = endBlock; if (firstBlockIsIncomplete) { readIncompleteBlock(*beginBlock); - ++beginBlock; + ++completeBlocksBegin; if (timer) { timer->wlock()->checkTimeoutAndThrow("IndexScan :"); } @@ -147,99 +153,68 @@ void CompressedRelationReader::scan( // Process all the other (complete) blocks. The compressed blocks are read // sequentially from disk and then decompressed in parallel. 
- const size_t blockIndexBegin = beginBlock - metadataForAllBlocks.begin(); - size_t blockIndex = blockIndexBegin; - if (beginBlock < endBlock) { + if (completeBlocksBegin < completeBlocksEnd) { #pragma omp parallel #pragma omp single - { - for (; beginBlock < endBlock; ++beginBlock) { - const auto& block = *beginBlock; - std::pair numInsAndDel = - numInsAndDelPerBlock.at(blockIndex - blockIndexBegin); - - // Read the compressed block from disk (sequentially). - CompressedBlock compressedBuffer = - readCompressedBlockFromFile(block, file, std::nullopt); - - // This lambda decompresses the block that was just read to the - // correct position in the result. - auto decompressLambda = [&result, rowIndexOfNextBlock, &block, - &numInsAndDel, &locatedTriplesPerBlock, - &blockIndex, - compressedBuffer = - std::move(compressedBuffer)]() { - ad_utility::TimeBlockAndLog tbl{"Decompressing a block"}; - - decompressBlockToExistingIdTable( - compressedBuffer, block._numRows, *result, rowIndexOfNextBlock, - numInsAndDel, locatedTriplesPerBlock, blockIndex); - }; + for (auto block = completeBlocksBegin; block < completeBlocksEnd; ++block) { + size_t blockIndex = block - metadataForAllBlocks.begin(); + auto numInsAndDel = numInsAndDelPerBlock.at(block - beginBlock); - // This `decompressLambda` can run concurrently. -#pragma omp task - { - if (!timer || !timer->wlock()->hasTimedOut()) { - decompressLambda(); - }; - } + // Read the compressed block from disk (both columns). + CompressedBlock compressedBuffer = + readCompressedBlockFromFile(*block, file, std::nullopt); + + // This lambda decompresses the block that was just read to the + // correct position in the result. 
+ auto decompressLambda = [&result, &locatedTriplesPerBlock, &block, + numInsAndDel, offsetInResult, blockIndex, + compressedBuffer = + std::move(compressedBuffer)]() { + ad_utility::TimeBlockAndLog tbl{"Decompressing a block"}; + + decompressBlockToExistingIdTable(compressedBuffer, block->_numRows, + *result, offsetInResult, numInsAndDel, + locatedTriplesPerBlock, blockIndex); + }; - // This is again serial code, which sets up the correct pointers for the - // next block. - AD_CORRECTNESS_CHECK(numInsAndDel.second <= block._numRows); - size_t numRowsOfThisBlock = - block._numRows + numInsAndDel.first - numInsAndDel.second; - AD_CORRECTNESS_CHECK(numRowsOfThisBlock <= spaceLeft); - spaceLeft -= numRowsOfThisBlock; - rowIndexOfNextBlock += numRowsOfThisBlock; - ++blockIndex; + // This `decompressLambda` can run concurrently. +#pragma omp task + { + if (!timer || !timer->wlock()->hasTimedOut()) { + decompressLambda(); + }; } + + // Update the counters. + AD_CORRECTNESS_CHECK(numInsAndDel.second <= block->_numRows); + size_t numRowsOfThisBlock = + block->_numRows + numInsAndDel.first - numInsAndDel.second; + AD_CORRECTNESS_CHECK(numRowsOfThisBlock <= spaceLeft); + spaceLeft -= numRowsOfThisBlock; + offsetInResult += numRowsOfThisBlock; } // End of omp parallel region, all blocks are decompressed now. } - // Check whether there are relevant delta triples in the next block. If yes, - // these must all come contiguously at the very beginning of that block, have - // `rowIndexInBlock == std::numeric_limits::max()` and `id1 == col0Id` - // and must all be inserts. - // - // TODO: This should be a separate function (of `LocatedTriplesPerBlock`?). 
- AD_CORRECTNESS_CHECK(numInsAndDelPerBlock.size() >= 1); - size_t numIns = numInsAndDelPerBlock.back().first; - if (numIns > 0) { - // LOG(INFO) << "Triples to be inserted after last block" << std::endl; - // LOG(INFO) << "numInsAndDel.first: " << numIns << std::endl; - AD_CORRECTNESS_CHECK(result->numRows() >= rowIndexOfNextBlock + numIns); - AD_CORRECTNESS_CHECK(locatedTriplesPerBlock.map_.contains(blockIndex)); - size_t rowIndex = rowIndexOfNextBlock; - const LocatedTriples& locatedTriples = - locatedTriplesPerBlock.map_.at(blockIndex); - AD_CORRECTNESS_CHECK(locatedTriples.size() >= numIns); - for (const auto& locatedTriple : locatedTriples) { - // LOG(INFO) << "Located triple: " << locatedTriple.id1 << " " - // << locatedTriple.id2 << " " << locatedTriple.id3 - // << " rowIndexInBlock = " << locatedTriple.rowIndexInBlock - // << std::endl; - if (locatedTriple.id1 == col0Id) { - AD_CORRECTNESS_CHECK(locatedTriple.rowIndexInBlock == - std::numeric_limits::max()); - (*result)(rowIndex, 0) = locatedTriple.id2; - (*result)(rowIndex, 1) = locatedTriple.id3; - ++rowIndex; - --spaceLeft; - } else { - break; - } - } + // Add delta triples from beyond last block, if any. 
+ AD_CORRECTNESS_CHECK(numInsAndDelPerBlock.size() > 0); + auto numInsBeyondLastBlock = numInsAndDelPerBlock.back().first; + if (numInsBeyondLastBlock > 0) { + size_t blockIndex = endBlock - metadataForAllBlocks.begin(); + size_t numRowsWrittenToResult = locatedTriplesPerBlock.mergeTriples( + blockIndex, std::nullopt, *result, offsetInResult, col0Id); + AD_CORRECTNESS_CHECK(numRowsWrittenToResult == numInsBeyondLastBlock); + spaceLeft -= numRowsWrittenToResult; } AD_CORRECTNESS_CHECK(spaceLeft == 0); } // _____________________________________________________________________________ void CompressedRelationReader::scan( - const CompressedRelationMetadata& metaData, Id col1Id, - const vector& blocks, ad_utility::File& file, - IdTable* result, ad_utility::SharedConcurrentTimeoutTimer timer, + const CompressedRelationMetadata& metadataForRelation, Id col1Id, + const vector& metadataForAllBlocks, + ad_utility::File& file, IdTable* result, + ad_utility::SharedConcurrentTimeoutTimer timer, const LocatedTriplesPerBlock& locatedTriplesPerBlock) const { AD_CONTRACT_CHECK(result->numColumns() == 1); @@ -259,125 +234,221 @@ void CompressedRelationReader::scan( return endBeforeBegin; }; - Id col0Id = metaData._col0Id; + Id col0Id = metadataForRelation._col0Id; // Note: See the comment in the other overload for `scan` above for the // reason why we (currently) can't use a structured binding here. - decltype(blocks.begin()) beginBlock, endBlock; + decltype(metadataForAllBlocks.begin()) beginBlock, endBlock; std::tie(beginBlock, endBlock) = - std::equal_range(blocks.begin(), blocks.end(), + std::equal_range(metadataForAllBlocks.begin(), metadataForAllBlocks.end(), KeyLhs{col0Id, col0Id, col1Id, col1Id}, comp); - // PRELIMINARY: Say how many delta triples are contained in those blocks. 
- size_t numDeltaTriples = 0; - for (auto block = beginBlock; block < endBlock; ++block) { - size_t blockIndex = block - blocks.begin(); - if (locatedTriplesPerBlock.map_.contains(blockIndex)) { - numDeltaTriples += locatedTriplesPerBlock.map_.at(blockIndex).size(); - } + // Compute the number of inserted and deleted triples per block and overall. + // note the `<=` so that we don't forget the block beyond the last (which may + // have information about delta triples at the vey end of a relation). + std::vector> numInsAndDelPerBlock; + size_t numInsTotal = 0; + size_t numDelTotal = 0; + for (auto block = beginBlock; block <= endBlock; ++block) { + size_t blockIndex = block - metadataForAllBlocks.begin(); + auto [numIns, numDel] = + block == beginBlock || block == endBlock - 1 || block == endBlock + ? locatedTriplesPerBlock.numTriples(blockIndex, col0Id, col1Id) + : locatedTriplesPerBlock.numTriples(blockIndex); + numInsTotal += numIns; + numDelTotal += numDel; + numInsAndDelPerBlock.push_back({numIns, numDel}); + } + if (numInsTotal > 0 || numDelTotal > 0) { + LOG(INFO) << "Index scan with delta triples: #inserts = " << numInsTotal + << ", #deletes = " << numDelTotal + << ", #blocks = " << (endBlock - beginBlock) << std::endl; + AD_CORRECTNESS_CHECK(numDelTotal < metadataForRelation.getNofElements()); } - LOG(INFO) << "Number of delta triples in blocks scanned: " << numDeltaTriples - << std::endl; // Invariant: The col0Id is completely stored in a single block, or it is // contained in multiple blocks that only contain this col0Id, - bool col0IdHasExclusiveBlocks = - metaData._offsetInBlock == std::numeric_limits::max(); + bool col0IdHasExclusiveBlocks = metadataForRelation._offsetInBlock == + std::numeric_limits::max(); if (!col0IdHasExclusiveBlocks) { // This might also be zero if no block was found at all. 
AD_CORRECTNESS_CHECK(endBlock - beginBlock <= 1); } - // The first and the last block might be incomplete (that is, only - // a part of these blocks is actually part of the result, - // set up a lambda which allows us to read these blocks, and returns - // the result as a vector. - auto readPossiblyIncompleteBlock = [&](const auto& block) { - DecompressedBlock uncompressedBuffer = - readAndDecompressBlock(block, file, std::nullopt); - AD_CORRECTNESS_CHECK(uncompressedBuffer.numColumns() == 2); - const auto& col1Column = uncompressedBuffer.getColumn(0); - const auto& col2Column = uncompressedBuffer.getColumn(1); - AD_CORRECTNESS_CHECK(col1Column.size() == col2Column.size()); - - // Find the range in the block, that belongs to the same relation `col0Id` - bool containedInOnlyOneBlock = - metaData._offsetInBlock != std::numeric_limits::max(); - auto begin = col1Column.begin(); - if (containedInOnlyOneBlock) { - begin += metaData._offsetInBlock; - } - auto end = - containedInOnlyOneBlock ? begin + metaData._numRows : col1Column.end(); - - // Find the range in the block, where also the col1Id matches (the second - // ID in the `std::array` does not matter). - std::tie(begin, end) = std::equal_range(begin, end, col1Id); + // Helper class for a part of a block (needed for the first and last block in + // the following). These are small objects, so an unlimited allocator is OK. + struct BlockPart { + std::unique_ptr idTable = nullptr; + size_t rowIndexBegin = 0; + size_t rowIndexEnd = 0; + size_t blockIndex = 0; + std::pair numInsAndDel; + size_t size() const { return rowIndexEnd - rowIndexBegin; } + }; - size_t beginIndex = begin - col1Column.begin(); - size_t endIndex = end - col1Column.begin(); + // Helper lambda that extracts the relevant `Id`s from the given + // `blockMetadata` iterator. Returns the corresponding part and its (row + // index) begin and end index in the original block. 
+ // + // NOTE: This is used for the first and last block below because these may + // contain triples that do not match `col0Id` and `col1Id`. We cannot directly + // merge these into `result` because we first need to know its total size and + // resize it before we can write to it. + auto getBlockPart = [&](auto blockMetadata) -> BlockPart { + DecompressedBlock block = + readAndDecompressBlock(*blockMetadata, file, std::nullopt); + + // First find the range with matching `col0Id`. The `if` condition asks if + // the relation is contained in a single block (this one). + auto blockPartBegin = block.begin(); + auto blockPartEnd = block.end(); + if (metadataForRelation._offsetInBlock != + std::numeric_limits::max()) { + blockPartBegin += metadataForRelation._offsetInBlock; + blockPartEnd = blockPartBegin + metadataForRelation._numRows; + AD_CORRECTNESS_CHECK(blockPartBegin < block.end()); + AD_CORRECTNESS_CHECK(blockPartEnd <= block.end()); + } - // Only extract the relevant portion of the second column. - std::vector result(col2Column.begin() + beginIndex, - col2Column.begin() + endIndex); - return result; + // Within that range find the subrange, where also `col1Id` matches. + std::tie(blockPartBegin, blockPartEnd) = std::equal_range( + blockPartBegin, blockPartEnd, std::array{col1Id}, + [](const auto& x, const auto& y) { return x[0] < y[0]; }); + // std::cout << "Block part: "; + // std::transform(blockPartBegin, blockPartEnd, + // std::ostream_iterator(std::cout, " "), + // [](const auto& row) { + // return absl::StrCat("{", row[0], " ", row[1], "}"); + // }); + // std::cout << std::endl; + + // Variables for the index of this block and the range. + // + // TODO: `IndexTest.scanTest` failes if we check `rowIndexEnd > + // rowIndexBegin` instead of just `>='. Can this really happen? 
+ size_t rowIndexBegin = blockPartBegin - block.begin(); + size_t rowIndexEnd = blockPartEnd - block.begin(); + AD_CORRECTNESS_CHECK(rowIndexBegin < block.size()); + AD_CORRECTNESS_CHECK(rowIndexEnd <= block.size()); + AD_CORRECTNESS_CHECK(rowIndexEnd >= rowIndexBegin); + size_t blockIndex = blockMetadata - metadataForAllBlocks.begin(); + auto numInsAndDel = numInsAndDelPerBlock.at(blockMetadata - beginBlock); + + // Copy `block` to an `IdTable`. + // + // TODO: This is an unecessary copy. Extend the `IdTable` class so that we + // can move the data from the second column of `block` to `blockAsIdTable`. + ad_utility::AllocatorWithLimit allocator{ + ad_utility::makeAllocationMemoryLeftThreadsafeObject( + std::numeric_limits::max())}; + IdTable result(1, allocator); + result.resize(block.size()); + for (size_t i = 0; i < block.size(); ++i) { + result(i, 0) = block(i, 1); + } + return {std::make_unique(std::move(result)), rowIndexBegin, + rowIndexEnd, blockIndex, numInsAndDel}; }; - // The first and the last block might be incomplete, compute - // and store the partial results from them. - std::vector firstBlockResult, lastBlockResult; + // The first and the last block might be incomplete. We process them + // separately from the complete blocks inbetween. + BlockPart firstBlockPart; + BlockPart lastBlockPart; + auto completeBlocksBegin = beginBlock; + auto completeBlocksEnd = endBlock; if (beginBlock < endBlock) { - firstBlockResult = readPossiblyIncompleteBlock(*beginBlock); - ++beginBlock; + firstBlockPart = getBlockPart(beginBlock); + ++completeBlocksBegin; if (timer) { timer->wlock()->checkTimeoutAndThrow("IndexScan: "); } } - if (beginBlock < endBlock) { - lastBlockResult = readPossiblyIncompleteBlock(*(endBlock - 1)); - endBlock--; + if (completeBlocksBegin < endBlock) { + --completeBlocksEnd; + lastBlockPart = getBlockPart(completeBlocksEnd); if (timer) { timer->wlock()->checkTimeoutAndThrow("IndexScan: "); } } - // Determine the total size of the result. 
- // First accumulate the complete blocks in the "middle" - auto totalResultSize = std::accumulate( - beginBlock, endBlock, 0ul, [](const auto& count, const auto& block) { - return count + block._numRows; - }); - // Add the possibly incomplete blocks from the beginning and end; - totalResultSize += firstBlockResult.size() + lastBlockResult.size(); - + // The total result size is the size of complete blocks plus the size of the + // possibly incomplete blocks at the beginning and end, plus the number of + // inserted triples minus the number of deleted triples. + auto totalResultSize = + std::accumulate(completeBlocksBegin, completeBlocksEnd, 0ul, + [](const auto& count, const auto& block) { + return count + block._numRows; + }); + totalResultSize += firstBlockPart.size() + lastBlockPart.size(); + AD_CORRECTNESS_CHECK(numDelTotal <= totalResultSize); + totalResultSize += numInsTotal - numDelTotal; result->resize(totalResultSize); + size_t spaceLeft = result->size(); + size_t offsetInResult = 0; - // Insert the first block into the result; - std::copy(firstBlockResult.begin(), firstBlockResult.end(), - result->getColumn(0).data()); - size_t rowIndexOfNextBlockStart = firstBlockResult.size(); + // Helper lambda for processing the first or last block. + // + // NOTE: This should only be called once for a given `BlockPart` because the + // `idTable` is moved away from it. + auto processBlockPart = [&](BlockPart& blockPart) { + if (blockPart.idTable) { + size_t numRowsWrittenToResult = 0; + // If there are no delta triples, copy directly to the result, otherwise + // use (the slightly more expensive) `mergeTriples`. 
+ if (blockPart.numInsAndDel == std::pair{0, 0}) { + for (size_t i = 0; i < blockPart.size(); ++i) { + (*result)(offsetInResult + i, 0) = + (*blockPart.idTable)(blockPart.rowIndexBegin + i, 0); + } + numRowsWrittenToResult = blockPart.size(); + } else { + numRowsWrittenToResult = locatedTriplesPerBlock.mergeTriples( + blockPart.blockIndex, std::move(*(blockPart.idTable)), *result, + offsetInResult, col0Id, col1Id, blockPart.rowIndexBegin, + blockPart.rowIndexEnd); + } + // Check that `numRowsWrittenToResult` is as expected. + { + size_t expected = blockPart.size(); + AD_CORRECTNESS_CHECK(blockPart.numInsAndDel.second <= expected); + expected += blockPart.numInsAndDel.first; + expected -= blockPart.numInsAndDel.second; + AD_CORRECTNESS_CHECK(numRowsWrittenToResult == expected); + } + AD_CORRECTNESS_CHECK(numRowsWrittenToResult <= spaceLeft); + offsetInResult += numRowsWrittenToResult; + spaceLeft -= numRowsWrittenToResult; + } + }; - // Insert the complete blocks from the middle in parallel - if (beginBlock < endBlock) { + // Process the first block part, then all the complete blocks, then the last + // block part. The complete blocks are read sequentially from disk and then + // (after we know their position in `result`) decompressed and merged into + // `result` in parallel. + processBlockPart(firstBlockPart); + if (completeBlocksBegin < completeBlocksEnd) { #pragma omp parallel #pragma omp single - for (; beginBlock < endBlock; ++beginBlock) { - const auto& block = *beginBlock; + for (auto block = completeBlocksBegin; block < completeBlocksEnd; ++block) { + size_t blockIndex = block - metadataForAllBlocks.begin(); + auto numInsAndDel = numInsAndDelPerBlock.at(block - beginBlock); - // Read the block serially, only read the second column. - AD_CORRECTNESS_CHECK(block._offsetsAndCompressedSize.size() == 2); + // Read the compressed block from disk (second column only). 
+ AD_CORRECTNESS_CHECK(block->_offsetsAndCompressedSize.size() == 2); CompressedBlock compressedBuffer = - readCompressedBlockFromFile(block, file, std::vector{1ul}); + readCompressedBlockFromFile(*block, file, std::vector{1ul}); // A lambda that owns the compressed block decompresses it to the // correct position in the result. It may safely be run in parallel - auto decompressLambda = [rowIndexOfNextBlockStart, &block, result, + auto decompressLambda = [&result, &block, &locatedTriplesPerBlock, + offsetInResult, numInsAndDel, blockIndex, compressedBuffer = std::move(compressedBuffer)]() mutable { ad_utility::TimeBlockAndLog tbl{"Decompression a block"}; - decompressBlockToExistingIdTable(compressedBuffer, block._numRows, - *result, rowIndexOfNextBlockStart); + decompressBlockToExistingIdTable(compressedBuffer, block->_numRows, + *result, offsetInResult, numInsAndDel, + locatedTriplesPerBlock, blockIndex); }; // Register an OpenMP task that performs the decompression of this @@ -389,15 +460,29 @@ void CompressedRelationReader::scan( } } - // update the pointers - rowIndexOfNextBlockStart += block._numRows; - } // end of parallel region + // Update the counters. + AD_CORRECTNESS_CHECK(numInsAndDel.second <= block->_numRows); + size_t numRowsOfThisBlock = + block->_numRows + numInsAndDel.first - numInsAndDel.second; + AD_CORRECTNESS_CHECK(numRowsOfThisBlock <= spaceLeft); + spaceLeft -= numRowsOfThisBlock; + offsetInResult += numRowsOfThisBlock; + } + // End of omp parallel region, all blocks are decompressed now. } - // Add the last block. - std::copy(lastBlockResult.begin(), lastBlockResult.end(), - result->getColumn(0).data() + rowIndexOfNextBlockStart); - AD_CORRECTNESS_CHECK(rowIndexOfNextBlockStart + lastBlockResult.size() == - result->size()); + processBlockPart(lastBlockPart); + + // Add delta triples from beyond last block, if any. 
+ AD_CORRECTNESS_CHECK(numInsAndDelPerBlock.size() > 0); + auto numInsBeyondLastBlock = numInsAndDelPerBlock.back().first; + if (numInsBeyondLastBlock > 0) { + size_t blockIndex = endBlock - metadataForAllBlocks.begin(); + size_t numRowsWrittenToResult = locatedTriplesPerBlock.mergeTriples( + blockIndex, std::nullopt, *result, offsetInResult, col0Id, col1Id); + AD_CORRECTNESS_CHECK(numRowsWrittenToResult == numInsBeyondLastBlock); + spaceLeft -= numRowsWrittenToResult; + } + AD_CORRECTNESS_CHECK(spaceLeft == 0); } // _____________________________________________________________________________ @@ -616,8 +701,8 @@ void CompressedRelationReader::decompressBlockToExistingIdTable( IdTable decompressedBlock(compressedBlock.size(), allocator); decompressedBlock.resize(numRowsToRead); decompressToIdTable(decompressedBlock, 0); - locatedTriplesPerBlock.mergeTriples(blockIndex, decompressedBlock, result, - offsetInResult); + locatedTriplesPerBlock.mergeTriples( + blockIndex, std::move(decompressedBlock), result, offsetInResult); } } diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index af5d4c8834..755ef3ef35 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -67,17 +67,36 @@ std::pair LocatedTriplesPerBlock::numTriples(size_t blockIndex, // ____________________________________________________________________________ template -void LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, - const IdTable& block, IdTable& result, - size_t offsetInResult, - size_t offsetOfBlock, Id id1, - Id id2) const { - // This method should only be called, if located triples in that block exist - // and for blocks with one or two columns. +size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + std::optional block, + IdTable& result, + size_t offsetInResult, Id id1, + Id id2, size_t rowIndexInBlockBegin, + size_t rowIndexInBlockEnd) const { + // This method should only be called, if located triples in that block exist. 
+ // The two `rowIndexInBlock`s should define a valid non-empty range. Both + // `block` and `result` must have one column for `MatchMode::MatchId1AndId2` + // and two columns otherwise. If `block` is `std::nullopt`, we are in a + // special case were only delta triples are inserted (which is only used for + // `matchMode`s other than `MatchAll`). + AD_CONTRACT_CHECK(block.has_value() || matchMode != MatchMode::MatchAll); + if (rowIndexInBlockEnd == LocatedTriple::NO_ROW_INDEX && block.has_value()) { + rowIndexInBlockEnd = block.value().size(); + } AD_CONTRACT_CHECK(map_.contains(blockIndex)); - AD_CONTRACT_CHECK(block.numColumns() == 1 || block.numColumns() == 2); + if (block.has_value()) { + AD_CONTRACT_CHECK(rowIndexInBlockBegin < block.value().size()); + AD_CONTRACT_CHECK(rowIndexInBlockEnd <= block.value().size()); + } + AD_CONTRACT_CHECK(rowIndexInBlockBegin < rowIndexInBlockEnd); + if constexpr (matchMode == MatchMode::MatchId1AndId2) { + AD_CONTRACT_CHECK(!block.has_value() || block.value().numColumns() == 1); + AD_CONTRACT_CHECK(result.numColumns() == 1); + } else { + AD_CONTRACT_CHECK(!block.has_value() || block.value().numColumns() == 2); + AD_CONTRACT_CHECK(result.numColumns() == 2); + } - AD_CONTRACT_CHECK(block.numColumns() == result.numColumns()); auto resultEntry = result.begin() + offsetInResult; const auto& locatedTriples = map_.at(blockIndex); auto locatedTriple = locatedTriples.begin(); @@ -97,73 +116,85 @@ void LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, // Skip located triples that come before `offsetOfBlock` because this may be a // partial block. while (locatedTriple != locatedTriples.end() && - locatedTriple->rowIndexInBlock < offsetOfBlock) { + locatedTriple->rowIndexInBlock < rowIndexInBlockBegin) { ++locatedTriple; } - // Iterate over the input block. Keep track of the row index, which is - // `offsetInBlock` for the first element of the block. 
- size_t rowIndex = offsetOfBlock; - for (const auto& blockEntry : block) { - // Append triples that are marked for insertion at this position to the + // Iterate over the specified part of `block`. In the special case where + // `block` is `std::nullopt`, just insert the delta triples at the beginning, + // which all have `NO_ROW_INDEX`. + if (!block.has_value()) { + rowIndexInBlockBegin = LocatedTriple::NO_ROW_INDEX; + rowIndexInBlockEnd = rowIndexInBlockBegin + 1; + AD_CORRECTNESS_CHECK(rowIndexInBlockBegin < rowIndexInBlockEnd); + } + for (size_t rowIndex = rowIndexInBlockBegin; rowIndex < rowIndexInBlockEnd; + ++rowIndex) { + // Append triples that are marked for insertion at this `rowIndex` to the // result. while (locatedTriple != locatedTriples.end() && locatedTriple->rowIndexInBlock == rowIndex && locatedTriple->existsInIndex == false) { if (locatedTripleMatches()) { - if (result.numColumns() == 2) { + if constexpr (matchMode == MatchMode::MatchId1AndId2) { + (*resultEntry)[0] = locatedTriple->id3; + } else { (*resultEntry)[0] = locatedTriple->id2; (*resultEntry)[1] = locatedTriple->id3; - } else { - (*resultEntry)[0] = locatedTriple->id3; } ++resultEntry; } ++locatedTriple; } - // Append the triple at this position to the result if and only if it is + // Append the triple at this position to the result if and only if it is not // marked for deletion and matches (also skip it if it doesn't match). - bool deleteBlockEntry = false; + bool deleteThisEntry = false; if (locatedTriple != locatedTriples.end() && locatedTriple->rowIndexInBlock == rowIndex && locatedTriple->existsInIndex == true) { - deleteBlockEntry = locatedTripleMatches(); + deleteThisEntry = locatedTripleMatches(); ++locatedTriple; } - if (!deleteBlockEntry) { - *resultEntry++ = blockEntry; + if (block.has_value() && !deleteThisEntry) { + *resultEntry++ = block.value()[rowIndex]; } - - // Update `rowIndex` for the next `blockEntry`. 
- ++rowIndex; }; + + // Return the number of rows written to `result`. + return resultEntry - (result.begin() + offsetInResult); } // ____________________________________________________________________________ -void LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, - const IdTable& block, IdTable& result, - size_t offsetInResult) const { - mergeTriples(blockIndex, block, result, offsetInResult); +size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + std::optional block, + IdTable& result, + size_t offsetInResult) const { + return mergeTriples(blockIndex, std::move(block), result, + offsetInResult); } // ____________________________________________________________________________ -void LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, - const IdTable& block, IdTable& result, - size_t offsetInResult, - size_t rowIndexOffset, Id id1) const { - mergeTriples(blockIndex, block, result, offsetInResult, - rowIndexOffset, id1); +size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + std::optional block, + IdTable& result, + size_t offsetInResult, Id id1, + size_t rowIndexInBlockBegin) const { + return mergeTriples( + blockIndex, std::move(block), result, offsetInResult, id1, + Id::makeUndefined(), rowIndexInBlockBegin); } // ____________________________________________________________________________ -void LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, - const IdTable& block, IdTable& result, - size_t offsetInResult, - size_t rowIndexOffset, Id id1, - Id id2) const { - mergeTriples( - blockIndex, block, result, offsetInResult, rowIndexOffset, id1, id2); +size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, + std::optional block, + IdTable& result, + size_t offsetInResult, Id id1, + Id id2, size_t rowIndexInBlockBegin, + size_t rowIndexInBlockEnd) const { + return mergeTriples( + blockIndex, std::move(block), result, offsetInResult, id1, id2, + rowIndexInBlockBegin, rowIndexInBlockEnd); } // 
____________________________________________________________________________ @@ -199,15 +230,13 @@ LocatedTriple LocatedTriple::locateTripleInPermutation( size_t blockIndex = matchingBlock - blocks.begin(); // Preliminary `FindTripleResult` object with the correct `blockIndex` and - // IDs, but still an invalid `rowIndexInBlock` and `existsInIndex` set to - // `false`. - LocatedTriple locatedTriple{ - blockIndex, std::numeric_limits::max(), id1, id2, id3, false}; + // `Id`s, and a special `rowIndexInBlock` (see below) and `existsInIndex` set + // to `false`. + LocatedTriple locatedTriple{blockIndex, NO_ROW_INDEX, id1, id2, id3, false}; - // If all IDs from all blocks are smaller, we return the index of the last - // block plus one (typical "end" semantics) and any position in the block - // (in the code that uses the result, that position will not be used in - // this case). + // If all `Id`s from all blocks are smaller, we return the index of the last + // block plus one (typical "end" semantics) and `NO_ROW_INDEX` (see above and + // how this is considered in `mergeTriples`). if (matchingBlock == blocks.end()) { AD_CORRECTNESS_CHECK(blockIndex == blocks.size()); return locatedTriple; @@ -286,6 +315,34 @@ LocatedTriple LocatedTriple::locateTripleInPermutation( return locatedTriple; } +// ____________________________________________________________________________ +std::ostream& operator<<(std::ostream& os, const LocatedTriple& lt) { + os << "LT(" << lt.blockIndex << " " + << (lt.rowIndexInBlock == LocatedTriple::NO_ROW_INDEX + ? 
"NO_ROW_INDEX" + : std::to_string(lt.rowIndexInBlock)) + << " " << lt.id1 << " " << lt.id2 << " " << lt.id3 << " " + << lt.existsInIndex << ")"; + return os; +} + +// ____________________________________________________________________________ +std::ostream& operator<<(std::ostream& os, const LocatedTriples& lts) { + os << "{"; + std::copy(lts.begin(), lts.end(), + std::ostream_iterator(std::cout, " ")); + os << "}"; + return os; +} + +// ____________________________________________________________________________ +std::ostream& operator<<(std::ostream& os, const LocatedTriplesPerBlock& ltpb) { + for (auto [blockIndex, lts] : ltpb.map_) { + os << "Block #" << blockIndex << ": " << lts << std::endl; + } + return os; +} + // Explicit instantiation for the six permutation. #define INSTANTIATE_LTIP(Permutation) \ template LocatedTriple \ diff --git a/src/index/LocatedTriples.h b/src/index/LocatedTriples.h index 08c0b014aa..426fc2bdc7 100644 --- a/src/index/LocatedTriples.h +++ b/src/index/LocatedTriples.h @@ -33,6 +33,10 @@ struct LocatedTriple { template static LocatedTriple locateTripleInPermutation( Id id1, Id id2, Id id3, const Permutation& permutation); + + // Special row index for triples that belong to previous block. It is + // important that this value plus one is actually greater. + static const size_t NO_ROW_INDEX = std::numeric_limits::max() - 1; }; // A sorted set of triples located at the same position in a particular @@ -65,26 +69,31 @@ class LocatedTriplesPerBlock { std::pair numTriples(size_t blockIndex, Id id1) const; std::pair numTriples(size_t blockIndex, Id id1, Id id2) const; - // Merge the located triples for `blockIndex` into the given `block` (which - // might be the whole block with that index or just a part of it) and write - // the result to `result`, starting from position `offsetInResult`. 
+ // Merge the located triples for `blockIndex` into the given `blockPart` and + // write the result to `result`, starting from position `offsetInResult`. If + // `blockPart` is a whole index block, `offsetInBlock` is zero, otherwise it's + // the offset in the full block, where the part starts. // - // It is the resposibility of the caller that there is enough space or the + // It is the resposibility of the caller that there is enough space for the // result starting from that offset. Like for `numTriplesInBlock` above, // consider only triples that match `id1` (if provided) and `id2` (if - // provided). If `block` is just a part of an index block, the first triple of - // block has row index `rowIndexOffset` in the original block. + // provided). // - // TODO: Beware of triples inserted at the end of the block, they are found in - // the `LocatedTriples` for `blockIndex + 1`. It's up to `CompressedRelation` - // to handle that correctly. - void mergeTriples(size_t blockIndex, const IdTable& block, IdTable& result, - size_t offsetInResult) const; - void mergeTriples(size_t blockIndex, const IdTable& block, IdTable& result, - size_t offsetInResult, size_t rowIndexOffset, Id id1) const; - void mergeTriples(size_t blockIndex, const IdTable& block, IdTable& result, - size_t offsetInResult, size_t rowIndexOffset, Id id1, - Id id2) const; + // In the special case where `block == std::nullopt`, we are just inserting + // the located triples for block `blockIndex` where the `rowIndexInBlock` is + // `NO_ROW_INDEX`. These actually belong to the previous block, but were + // larger than all triples there. + // + // Returns the number of rows written to `result`. 
+ size_t mergeTriples(size_t blockIndex, std::optional block, + IdTable& result, size_t offsetInResult) const; + size_t mergeTriples(size_t blockIndex, std::optional block, + IdTable& result, size_t offsetInResult, Id id1, + size_t rowIndexInBlockBegin = 0) const; + size_t mergeTriples( + size_t blockIndex, std::optional block, IdTable& result, + size_t offsetInResult, Id id1, Id id2, size_t rowIndexInBlockBegin = 0, + size_t rowIndexInBlockEnd = LocatedTriple::NO_ROW_INDEX) const; // Add the given `locatedTriple` to the given `LocatedTriplesPerBlock`. // Returns a handle to where it was added (via which we can easily remove it @@ -124,8 +133,15 @@ class LocatedTriplesPerBlock { // The only reason that the arguments `id1` and `id2` come at the end here is // so that we can give them default values. template - void mergeTriples(size_t blockIndex, const IdTable& block, IdTable& result, - size_t offsetInResult, size_t rowIndexOffset = 0, - Id id1 = Id::makeUndefined(), - Id id2 = Id::makeUndefined()) const; + size_t mergeTriples( + size_t blockIndex, std::optional block, IdTable& result, + size_t offsetInResult, Id id1 = Id::makeUndefined(), + Id id2 = Id::makeUndefined(), size_t rowIndexInBlockBegin = 0, + size_t rowIndexInBlockEnd = LocatedTriple::NO_ROW_INDEX) const; }; + +// Human-readable representation of `LocatedTriple`, `LocatedTriples`, and +// `LocatedTriplesPerBlock` that are very useful for debugging. 
+std::ostream& operator<<(std::ostream& os, const LocatedTriple& lt); +std::ostream& operator<<(std::ostream& os, const LocatedTriples& lts); +std::ostream& operator<<(std::ostream& os, const LocatedTriplesPerBlock& ltpb); diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp index 254600d6a3..3b8c9a31e0 100644 --- a/test/LocatedTriplesTest.cpp +++ b/test/LocatedTriplesTest.cpp @@ -103,9 +103,9 @@ TEST_F(LocatedTriplesTest, mergeTriples) { {20, 10}, // Row 4 {21, 11}, // Row 5 {30, 10}}); // Row 6 - IdTable result(block.numColumns(), ad_utility::testing::makeAllocator()); + IdTable result(2, ad_utility::testing::makeAllocator()); result.resize(resultExpected.size()); - locatedTriplesPerBlock.mergeTriples(1, block, result, 0); + locatedTriplesPerBlock.mergeTriples(1, block.clone(), result, 0); ASSERT_EQ(result, resultExpected); } @@ -120,163 +120,184 @@ TEST_F(LocatedTriplesTest, mergeTriples) { {21, 11}, // Row 5 {30, 10}, // Row 6 {30, 30}}); // Row 7 - IdTable result(block.numColumns(), ad_utility::testing::makeAllocator()); + IdTable result(2, ad_utility::testing::makeAllocator()); result.resize(resultExpected.size()); - locatedTriplesPerBlock.mergeTriples(1, block, result, 0, 0, V(2)); + locatedTriplesPerBlock.mergeTriples(1, block.clone(), result, 0, V(2)); ASSERT_EQ(result, resultExpected); } // Repeat but with a partial block that leaves out the first two elements of - // `block` (and correspondingly `offsetOfBlock == 2` in `mergeTriples`). + // `block`. 
{ - IdTable blockTruncated = block.clone(); - std::shift_left(blockTruncated.begin(), blockTruncated.end(), 2); - blockTruncated.resize(block.size() - 2); IdTable resultExpected = makeIdTableFromVector({{15, 30}, // Row 0 {20, 10}, // Row 1 {21, 11}, // Row 2 {30, 10}, // Row 3 {30, 30}}); // Row 4 - IdTable result(block.numColumns(), ad_utility::testing::makeAllocator()); + IdTable result(2, ad_utility::testing::makeAllocator()); result.resize(resultExpected.size()); - locatedTriplesPerBlock.mergeTriples(1, blockTruncated, result, 0, 2, V(2)); + locatedTriplesPerBlock.mergeTriples(1, block.clone(), result, 0, V(2), 2); ASSERT_EQ(result, resultExpected); } // Merge only the triples with `id1 == V(2)` and `id2 == V(30)` into the // corresponding partial block (one triple inserted, one triple deleted). - // - // TODO: I don't think this case can actually occur in our code. When `id1` - // and `id2` are specified, we are only interesting in `id3` for result. { - IdTable blockTruncated = makeIdTableFromVector({{30, 20}, {30, 30}}); - IdTable resultExpected = makeIdTableFromVector({{30, 10}, {30, 30}}); - IdTable result(blockTruncated.numColumns(), - ad_utility::testing::makeAllocator()); + IdTable blockColumnId3(1, ad_utility::testing::makeAllocator()); + blockColumnId3.resize(block.size()); + for (size_t i = 0; i < block.size(); ++i) { + blockColumnId3(i, 0) = block(i, 1); + } + IdTable resultExpected = makeIdTableFromVector({{10}, {30}}); + IdTable result(1, ad_utility::testing::makeAllocator()); result.resize(resultExpected.size()); - locatedTriplesPerBlock.mergeTriples(1, blockTruncated, result, 0, 4, V(2), - V(30)); + locatedTriplesPerBlock.mergeTriples(1, std::move(blockColumnId3), result, 0, + V(2), V(30), 4, 6); ASSERT_EQ(result, resultExpected); } - // Same, but only with the last column. + // Merge special triples. 
{ - IdTable blockTruncated = makeIdTableFromVector({{20}, {30}}); - IdTable resultExpected = makeIdTableFromVector({{10}, {30}}); - IdTable result(blockTruncated.numColumns(), - ad_utility::testing::makeAllocator()); + size_t NRI = LocatedTriple::NO_ROW_INDEX; + auto locatedTriplesPerBlock = makeLocatedTriplesPerBlock( + {LocatedTriple{2, NRI, V(1), V(30), V(40), true}, + LocatedTriple{2, NRI, V(1), V(30), V(50), true}, + LocatedTriple{2, NRI, V(1), V(40), V(10), true}}); + IdTable resultExpected = makeIdTableFromVector({{30, 40}, // Row 0 + {30, 50}, // Row 1 + {40, 10}}); // Row 2 + IdTable result(2, ad_utility::testing::makeAllocator()); result.resize(resultExpected.size()); - locatedTriplesPerBlock.mergeTriples(1, blockTruncated, result, 0, 4, V(2), - V(30)); - ASSERT_EQ(result, resultExpected); + locatedTriplesPerBlock.mergeTriples(2, std::nullopt, result, 0, V(1)); } } // Test `Permutation::scan` (and hence also `CompressedRelation::scan`) with // triples merged from a `locatedTriplesPerBlock` object. TEST_F(LocatedTriplesTest, scanWithMergeTriples) { - // TODO: Test with multiple block sizes. - size_t blockSizeInBytes = 32; - std::string basename = "LocatedTriplesTest.scanWithMergeTriples"; - std::string permutationFilename = basename + ".index.pso"; + // The actual test, for a given block size. + auto testWithGivenBlockSize = [](const size_t blockSizeInBytes) { + std::string basename = "LocatedTriplesTest.scanWithMergeTriples"; + std::string permutationFilename = basename + ".index.pso"; - // Helper lambda for creating a `BufferedIdTable` (which we need for - // `CompressedRelationWriter` from an ordinary `IdTable` with two columns). - // - // TODO: Something like this is also used in `CompressedRelationsTest`, so it - // should be in a helper class. - auto getBufferedIdTable = [](const IdTable& idTable) -> BufferedIdTable { - // Note that these files are never created because we set the threshold for - // writing to disk so large. 
- std::string bufferFilename1 = "compressedRelationWriter.buffer1.dat"; - std::string bufferFilename2 = "compressedRelationWriter.buffer2.dat"; - AD_CONTRACT_CHECK(idTable.numColumns() == 2); - BufferedIdTable bufferedIdTable{ - 2, - std::array{ad_utility::BufferedVector{ - std::numeric_limits::max(), bufferFilename1}, - ad_utility::BufferedVector{ - std::numeric_limits::max(), bufferFilename2}}}; - for (size_t i = 0; i < idTable.size(); ++i) { - bufferedIdTable.push_back({idTable(i, 0), idTable(i, 1)}); - } - return bufferedIdTable; - }; + // Helper lambda for creating a `BufferedIdTable` (which we need for + // `CompressedRelationWriter` from an ordinary `IdTable` with two columns). + // + // TODO: Something like this is also used in `CompressedRelationsTest`, so + // it should be in a helper class. + auto getBufferedIdTable = [](const IdTable& idTable) -> BufferedIdTable { + // Note that these files are never created because we set the threshold + // for writing to disk so large. + std::string bufferFilename1 = "compressedRelationWriter.buffer1.dat"; + std::string bufferFilename2 = "compressedRelationWriter.buffer2.dat"; + AD_CONTRACT_CHECK(idTable.numColumns() == 2); + BufferedIdTable bufferedIdTable{ + 2, + std::array{ad_utility::BufferedVector{ + std::numeric_limits::max(), bufferFilename1}, + ad_utility::BufferedVector{ + std::numeric_limits::max(), bufferFilename2}}}; + for (size_t i = 0; i < idTable.size(); ++i) { + bufferedIdTable.push_back({idTable(i, 0), idTable(i, 1)}); + } + return bufferedIdTable; + }; - // Our test relation. - Id relationId = V(1); - IdTable relation = makeIdTableFromVector({{10, 10}, // Row 0 - {15, 20}, // Row 1 - {15, 30}, // Row 2 - {20, 10}, // Row 3 - {30, 20}, // Row 4 - {30, 30}}); // Row 5 + // Our test relation. 
+ Id relationId = V(1); + IdTable relation = makeIdTableFromVector({{10, 10}, // Row 0 + {15, 20}, // Row 1 + {15, 30}, // Row 2 + {20, 10}, // Row 3 + {30, 20}, // Row 4 + {30, 30}}); // Row 5 - // Write it to disk (adapted from `CompressedRelationsTest`). The last value - // of the call to `addRelation` is the number of distinct elements. - ad_utility::File permutationFileForWritingRelations{permutationFilename, "w"}; - CompressedRelationWriter writer{std::move(permutationFileForWritingRelations), - blockSizeInBytes}; - writer.addRelation(relationId, getBufferedIdTable(relation), relation.size()); - writer.finish(); - auto metadataPerRelation = writer.getFinishedMetaData(); - auto metadataPerBlock = writer.getFinishedBlocks(); - AD_CORRECTNESS_CHECK(metadataPerRelation.size() == 1); + // Write it to disk (adapted from `CompressedRelationsTest`). The last value + // of the call to `addRelation` is the number of distinct elements. + ad_utility::File permutationFileForWritingRelations{permutationFilename, + "w"}; + CompressedRelationWriter writer{ + std::move(permutationFileForWritingRelations), blockSizeInBytes}; + writer.addRelation(relationId, getBufferedIdTable(relation), + relation.size()); + writer.finish(); + auto metadataPerRelation = writer.getFinishedMetaData(); + auto metadataPerBlock = writer.getFinishedBlocks(); + AD_CORRECTNESS_CHECK(metadataPerRelation.size() == 1); - // Append the metadata to the index file. - IndexMetaDataHmap metadata; - std::ranges::for_each(metadataPerRelation, - [&metadata](auto& md) { metadata.add(md); }); - metadata.blockData() = metadataPerBlock; - ad_utility::File permutationFileForWritingMetadata{permutationFilename, "r+"}; - metadata.appendToFile(&permutationFileForWritingMetadata); - permutationFileForWritingMetadata.close(); + // Append the metadata to the index file. 
+ IndexMetaDataHmap metadata; + std::ranges::for_each(metadataPerRelation, + [&metadata](auto& md) { metadata.add(md); }); + metadata.blockData() = metadataPerBlock; + ad_utility::File permutationFileForWritingMetadata{permutationFilename, + "r+"}; + metadata.appendToFile(&permutationFileForWritingMetadata); + permutationFileForWritingMetadata.close(); - // Create a permutation based on this. - LocatedTriplesPerBlock locatedTriplesPerBlock; - Permutation::PermutationImpl permutation{ - SortByPSO(), "PSO", ".pso", {1, 0, 2}, locatedTriplesPerBlock}; - permutation.loadFromDisk(basename); - // ad_utility::File permutationFileForReading{permutationFilename, "r"}; - // permutation._file = std::move(permutationFileForReading); - // permutation._meta = metadata; - // permutation._isLoaded = true; + // Create a permutation based on this. + LocatedTriplesPerBlock locatedTriplesPerBlock; + Permutation::PermutationImpl permutation{ + SortByPSO(), "PSO", ".pso", {1, 0, 2}, locatedTriplesPerBlock}; + permutation.loadFromDisk(basename); + // ad_utility::File permutationFileForReading{permutationFilename, "r"}; + // permutation._file = std::move(permutationFileForReading); + // permutation._meta = metadata; + // permutation._isLoaded = true; - // Read the (for this test: first and only) relation from disk and check that - // it is the same. - IdTable result(relation.numColumns(), ad_utility::testing::makeAllocator()); - permutation.scan(relationId, &result); - // CompressedRelationReader reader; - // reader.scan(metadataPerRelation[0], metadataPerBlock, permutation._file, - // &result, ad_utility::SharedConcurrentTimeoutTimer{}); - ASSERT_EQ(result, relation); + // Read the (for this test: first and only) relation from disk and check + // that it is the same. + { + IdTable result(2, ad_utility::testing::makeAllocator()); + permutation.scan(relationId, &result); + ASSERT_EQ(result, relation); + } - // Helper lambda for adding to `locatedTriplesPerBlock`. 
- auto locatedTriplesPerBlockAdd = [&locatedTriplesPerBlock, &relationId, - &permutation](Id id2, Id id3) { - locatedTriplesPerBlock.add(LocatedTriple::locateTripleInPermutation( - relationId, id2, id3, permutation)); - }; + // Helper lambda for adding to `locatedTriplesPerBlock`. + auto locatedTriplesPerBlockAdd = [&locatedTriplesPerBlock, &relationId, + &permutation](Id id2, Id id3) { + locatedTriplesPerBlock.add(LocatedTriple::locateTripleInPermutation( + relationId, id2, id3, permutation)); + }; + + // Again, but with some located triples merged (three inserts, four + // deletes). + locatedTriplesPerBlockAdd(V(15), V(20)); // Delete. + locatedTriplesPerBlockAdd(V(14), V(20)); // Insert. + locatedTriplesPerBlockAdd(V(20), V(10)); // Delete. + locatedTriplesPerBlockAdd(V(30), V(20)); // Delete. + locatedTriplesPerBlockAdd(V(30), V(30)); // Delete. + locatedTriplesPerBlockAdd(V(30), V(31)); // Insert at very end. + locatedTriplesPerBlockAdd(V(30), V(32)); // Insert at very end. + std::cout << locatedTriplesPerBlock; + { + IdTable result(2, ad_utility::testing::makeAllocator()); + permutation.scan(relationId, &result); + IdTable resultExpected = makeIdTableFromVector({{10, 10}, // Row 0 + {14, 20}, // Row 1 + {15, 30}, // Row 2 + {30, 31}, // Row 3 + {30, 32}}); // Row 4 + ASSERT_EQ(result, resultExpected); + } - // Again, but with some located triples merged (three inserts, four deletes). - locatedTriplesPerBlockAdd(V(15), V(20)); // Delete. - locatedTriplesPerBlockAdd(V(14), V(20)); // Insert. - locatedTriplesPerBlockAdd(V(20), V(10)); // Delete. - locatedTriplesPerBlockAdd(V(30), V(20)); // Delete. - locatedTriplesPerBlockAdd(V(30), V(30)); // Delete. - locatedTriplesPerBlockAdd(V(30), V(31)); // Insert at very end. - locatedTriplesPerBlockAdd(V(30), V(32)); // Insert at very end. 
- permutation.scan(relationId, &result); - // reader.scan(metadataPerRelation[0], metadataPerBlock, permutation._file, - // &result, ad_utility::SharedConcurrentTimeoutTimer{}, - // locatedTriplesPerBlock); - IdTable resultExpected = makeIdTableFromVector({{10, 10}, // Row 0 - {14, 20}, // Row 1 - {15, 30}, // Row 2 - {30, 31}, // Row 3 - {30, 32}}); // Row 4 - ASSERT_EQ(result, resultExpected); + // Now a scan where two `Id`s are fixed. + { + IdTable result(1, ad_utility::testing::makeAllocator()); + result.resize(2); + permutation.scan(relationId, V(30), &result); + IdTable resultExpected = makeIdTableFromVector({{31}, {32}}); + ASSERT_EQ(result, resultExpected); + } + + // Delete the file with the compressed relations. + ad_utility::deleteFile(permutationFilename); + }; - // Delete the file with the compressed relations. - ad_utility::deleteFile(permutationFilename); + // Now test for multiple block sizes (16 bytes is the minimum). + testWithGivenBlockSize(16); + testWithGivenBlockSize(32); + testWithGivenBlockSize(48); + testWithGivenBlockSize(64); + testWithGivenBlockSize(100'000); } From 38ca1fd8cb4666114254326024bd2746bf2f3b2a Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Tue, 4 Apr 2023 22:26:15 +0200 Subject: [PATCH 13/20] Delta triples now considered for all blocks There was still one case missing: the possibly incomplete block at the beginning when only a single `Id` is fixed. Now delta triples are also considered for these blocks. TODO: The unit test works for a permutation with a single relation, but there is still a problem when there are multiple relations.
--- src/index/CompressedRelation.cpp | 149 ++++++++++++++++++------------- test/LocatedTriplesTest.cpp | 29 +++--- 2 files changed, 106 insertions(+), 72 deletions(-) diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp index 3de1184102..8000b4759c 100644 --- a/src/index/CompressedRelation.cpp +++ b/src/index/CompressedRelation.cpp @@ -87,14 +87,6 @@ void CompressedRelationReader::scan( AD_CORRECTNESS_CHECK(numDelTotal < metadataForRelation.getNofElements()); } - // TODO: For now only consider delta triples in complete blocks. - if (firstBlockIsIncomplete) { - AD_CORRECTNESS_CHECK(numInsAndDelPerBlock.size() == 2); - numInsTotal = 0; - numDelTotal = 0; - LOG(WARN) << "Delta triples in incomplete block ignored!" << std::endl; - } - // The total size of the result is now known. result->resize(metadataForRelation.getNofElements() + numInsTotal - numDelTotal); @@ -110,49 +102,81 @@ void CompressedRelationReader::scan( // We have at most one block that is incomplete and thus requires trimming. // Set up a lambda, that reads this block and decompresses it to // the result. - auto readIncompleteBlock = [&](const auto& block) { + auto processIncompleteBlock = [&](const auto& blockMetadata) { // A block is uniquely identified by its start position in the file. // // NOTE: We read these blocks via a cache in order to speed up the unit - // tests (which make many requests to the same block, so we don't want to - // decompress it again and again). 
- auto cacheKey = block._offsetsAndCompressedSize.at(0)._offsetInFile; - auto uncompressedBuffer = blockCache_ - .computeOnce(cacheKey, - [&]() { - return readAndDecompressBlock( - block, file, std::nullopt); - }) - ._resultPointer; - - // Extract the part of the block that actually belongs to the relation - auto numElements = metadataForRelation._numRows; - AD_CORRECTNESS_CHECK(uncompressedBuffer->numColumns() == + // tests (which make many requests to the same block, so we don't want + // to decompress it again and again). + auto cacheKey = + blockMetadata->_offsetsAndCompressedSize.at(0)._offsetInFile; + auto block = blockCache_ + .computeOnce(cacheKey, + [&]() { + return readAndDecompressBlock( + *blockMetadata, file, std::nullopt); + }) + ._resultPointer; + + // Determine (via the metadata for the relation), exactly which part of the + // block belongs to the relation. + auto numInsAndDel = numInsAndDelPerBlock.at(blockMetadata - beginBlock); + size_t rowIndexBegin = metadataForRelation._offsetInBlock; + size_t rowIndexEnd = rowIndexBegin + metadataForRelation._numRows; + AD_CORRECTNESS_CHECK(rowIndexBegin < block->size()); + AD_CORRECTNESS_CHECK(rowIndexEnd <= block->size()); + AD_CORRECTNESS_CHECK(block->numColumns() == metadataForRelation.numColumns()); - for (size_t i = 0; i < uncompressedBuffer->numColumns(); ++i) { - const auto& inputCol = uncompressedBuffer->getColumn(i); - auto begin = inputCol.begin() + metadataForRelation._offsetInBlock; - auto resultColumn = result->getColumn(i); - AD_CORRECTNESS_CHECK(numElements <= spaceLeft); - std::copy(begin, begin + numElements, resultColumn.begin()); + size_t numRowsWrittenToResult = rowIndexEnd - rowIndexBegin; + + // Without delta triples, just copy the part of the block to `result`. + // Otherwise use `mergeTriples`. 
+ if (numInsAndDel == std::pair{0, 0}) { + for (size_t i = 0; i < numRowsWrittenToResult; ++i) { + (*result)(offsetInResult + i, 0) = (*block)(rowIndexBegin + i, 0); + (*result)(offsetInResult + i, 1) = (*block)(rowIndexBegin + i, 1); + } + } else { + // TODO: First copy `*block` to an object of class `IdTable`. This copy + // would be avoidable, see the related comment in `getBlockPart` in the + // other `CompressedRelationReader::scan` below. + ad_utility::AllocatorWithLimit allocator{ + ad_utility::makeAllocationMemoryLeftThreadsafeObject( + std::numeric_limits::max())}; + IdTable blockAsIdTable(2, allocator); + blockAsIdTable.resize(block->size()); + for (size_t i = 0; i < block->size(); ++i) { + blockAsIdTable(i, 0) = (*block)(i, 0); + blockAsIdTable(i, 1) = (*block)(i, 1); + } + // Now call `mergeTriples` on `blockAsIdTable`. + size_t blockIndex = blockMetadata - metadataForAllBlocks.begin(); + size_t numRowsWrittenExpected = numRowsWrittenToResult; + numRowsWrittenExpected += numInsAndDel.first - numInsAndDel.second; + numRowsWrittenToResult = locatedTriplesPerBlock.mergeTriples( + blockIndex, std::move(blockAsIdTable), *result, offsetInResult, + col0Id, rowIndexBegin); + AD_CORRECTNESS_CHECK(numRowsWrittenToResult == numRowsWrittenExpected); } - offsetInResult += numElements; - spaceLeft -= numElements; + + AD_CORRECTNESS_CHECK(numRowsWrittenToResult <= spaceLeft); + offsetInResult += numRowsWrittenToResult; + spaceLeft -= numRowsWrittenToResult; }; // Read the first block if it is incomplete auto completeBlocksBegin = beginBlock; auto completeBlocksEnd = endBlock; if (firstBlockIsIncomplete) { - readIncompleteBlock(*beginBlock); + processIncompleteBlock(beginBlock); ++completeBlocksBegin; if (timer) { timer->wlock()->checkTimeoutAndThrow("IndexScan :"); } } - // Process all the other (complete) blocks. The compressed blocks are read - // sequentially from disk and then decompressed in parallel. + // Process all the other (complete) blocks. 
The compressed blocks are + // read sequentially from disk and then decompressed in parallel. if (completeBlocksBegin < completeBlocksEnd) { #pragma omp parallel #pragma omp single @@ -244,8 +268,8 @@ void CompressedRelationReader::scan( KeyLhs{col0Id, col0Id, col1Id, col1Id}, comp); // Compute the number of inserted and deleted triples per block and overall. - // note the `<=` so that we don't forget the block beyond the last (which may - // have information about delta triples at the vey end of a relation). + // note the `<=` so that we don't forget the block beyond the last (which + // may have information about delta triples at the vey end of a relation). std::vector> numInsAndDelPerBlock; size_t numInsTotal = 0; size_t numDelTotal = 0; @@ -275,8 +299,9 @@ void CompressedRelationReader::scan( AD_CORRECTNESS_CHECK(endBlock - beginBlock <= 1); } - // Helper class for a part of a block (needed for the first and last block in - // the following). These are small objects, so an unlimited allocator is OK. + // Helper class for a part of a block (needed for the first and last block + // in the following). These are small objects, so an unlimited allocator is + // OK. struct BlockPart { std::unique_ptr idTable = nullptr; size_t rowIndexBegin = 0; @@ -291,9 +316,9 @@ void CompressedRelationReader::scan( // index) begin and end index in the original block. // // NOTE: This is used for the first and last block below because these may - // contain triples that do not match `col0Id` and `col1Id`. We cannot directly - // merge these into `result` because we first need to know its total size and - // resize it before we can write to it. + // contain triples that do not match `col0Id` and `col1Id`. We cannot + // directly merge these into `result` because we first need to know its + // total size and resize it before we can write to it. 
auto getBlockPart = [&](auto blockMetadata) -> BlockPart { DecompressedBlock block = readAndDecompressBlock(*blockMetadata, file, std::nullopt); @@ -337,7 +362,8 @@ void CompressedRelationReader::scan( // Copy `block` to an `IdTable`. // // TODO: This is an unecessary copy. Extend the `IdTable` class so that we - // can move the data from the second column of `block` to `blockAsIdTable`. + // can move the data from the second column of `block` to + // `blockAsIdTable`. ad_utility::AllocatorWithLimit allocator{ ad_utility::makeAllocationMemoryLeftThreadsafeObject( std::numeric_limits::max())}; @@ -492,8 +518,8 @@ float CompressedRelationWriter::computeMultiplicity( float multiplicity = functional ? 1.0f : static_cast(numElements) / float(numDistinctElements); - // Ensure that the multiplicity is only exactly 1.0 if the relation is indeed - // functional to prevent numerical instabilities; + // Ensure that the multiplicity is only exactly 1.0 if the relation is + // indeed functional to prevent numerical instabilities; if (!functional && multiplicity == 1.0f) [[unlikely]] { multiplicity = std::nextafter(1.0f, 2.0f); } @@ -656,11 +682,11 @@ void CompressedRelationReader::decompressBlockToExistingIdTable( IdTable& result, size_t offsetInResult, std::pair numInsAndDel, const LocatedTriplesPerBlock& locatedTriplesPerBlock, size_t blockIndex) { - // Check that the given arguments are consistent (they should always be, given - // that this method is `private`). - // LOG(INFO) << "numRowsToRead: " << numRowsToRead << std::endl; - // LOG(INFO) << "numInsAndDel.first: " << numInsAndDel.first << std::endl; - // LOG(INFO) << "numInsAndDel.second: " << numInsAndDel.second << std::endl; + // Check that the given arguments are consistent (they should always be, + // given that this method is `private`). 
LOG(INFO) << "numRowsToRead: " << + // numRowsToRead << std::endl; LOG(INFO) << "numInsAndDel.first: " << + // numInsAndDel.first << std::endl; LOG(INFO) << "numInsAndDel.second: " << + // numInsAndDel.second << std::endl; AD_CORRECTNESS_CHECK(numInsAndDel.second <= numRowsToRead); AD_CORRECTNESS_CHECK(result.numRows() + numInsAndDel.second >= offsetInResult + numRowsToRead + numInsAndDel.first); @@ -669,8 +695,9 @@ void CompressedRelationReader::decompressBlockToExistingIdTable( // Helper lambda that decompresses `numRowsToRead` from `compressedBlock` // to the given `IdTable` iterator. // - // TODO: It would be more natural to pass an `IdTable::iterator` here, but it - // seems that we can't get from that an iterator into an `IdTable` column. + // TODO: It would be more natural to pass an `IdTable::iterator` here, but + // it seems that we can't get from that an iterator into an `IdTable` + // column. // // TODO use zip_view. auto decompressToIdTable = [&compressedBlock, &numRowsToRead]( @@ -685,24 +712,24 @@ void CompressedRelationReader::decompressBlockToExistingIdTable( }; // If there are no delta triples for this block, just decompress directly to - // the `result` table. Otherwise decompress to an intermediate table and merge - // from there to `result`. + // the `result` table. Otherwise decompress to an intermediate table and + // merge from there to `result`. // // TODO: In the second case, we use an unlimited allocator for the space - // allocation for the intermediate table. This looks OK because our blocks are - // small, but it might be better to allocate also this table from the memory - // pool available to the server (to which we don't have acces here). + // allocation for the intermediate table. This looks OK because our blocks + // are small, but it might be better to allocate also this table from the + // memory pool available to the server (to which we don't have acces here). 
if (numInsAndDel == std::pair{0, 0}) { decompressToIdTable(result, offsetInResult); } else { ad_utility::AllocatorWithLimit allocator{ ad_utility::makeAllocationMemoryLeftThreadsafeObject( std::numeric_limits::max())}; - IdTable decompressedBlock(compressedBlock.size(), allocator); - decompressedBlock.resize(numRowsToRead); - decompressToIdTable(decompressedBlock, 0); - locatedTriplesPerBlock.mergeTriples( - blockIndex, std::move(decompressedBlock), result, offsetInResult); + IdTable blockAsIdTable(compressedBlock.size(), allocator); + blockAsIdTable.resize(numRowsToRead); + decompressToIdTable(blockAsIdTable, 0); + locatedTriplesPerBlock.mergeTriples(blockIndex, std::move(blockAsIdTable), + result, offsetInResult); } } diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp index 3b8c9a31e0..7cd2bfbd54 100644 --- a/test/LocatedTriplesTest.cpp +++ b/test/LocatedTriplesTest.cpp @@ -176,7 +176,8 @@ TEST_F(LocatedTriplesTest, mergeTriples) { // triples merged from a `locatedTriplesPerBlock` object. TEST_F(LocatedTriplesTest, scanWithMergeTriples) { // The actual test, for a given block size. - auto testWithGivenBlockSize = [](const size_t blockSizeInBytes) { + auto testWithGivenBlockSize = [](const size_t blockSizeInBytes, + size_t numRelations, Id relationId) { std::string basename = "LocatedTriplesTest.scanWithMergeTriples"; std::string permutationFilename = basename + ".index.pso"; @@ -204,7 +205,6 @@ TEST_F(LocatedTriplesTest, scanWithMergeTriples) { }; // Our test relation. 
- Id relationId = V(1); IdTable relation = makeIdTableFromVector({{10, 10}, // Row 0 {15, 20}, // Row 1 {15, 30}, // Row 2 @@ -218,12 +218,13 @@ TEST_F(LocatedTriplesTest, scanWithMergeTriples) { "w"}; CompressedRelationWriter writer{ std::move(permutationFileForWritingRelations), blockSizeInBytes}; - writer.addRelation(relationId, getBufferedIdTable(relation), - relation.size()); + for (size_t i = 1; i <= numRelations; ++i) { + writer.addRelation(V(i), getBufferedIdTable(relation), relation.size()); + } writer.finish(); auto metadataPerRelation = writer.getFinishedMetaData(); auto metadataPerBlock = writer.getFinishedBlocks(); - AD_CORRECTNESS_CHECK(metadataPerRelation.size() == 1); + AD_CORRECTNESS_CHECK(metadataPerRelation.size() == numRelations); // Append the metadata to the index file. IndexMetaDataHmap metadata; @@ -294,10 +295,16 @@ TEST_F(LocatedTriplesTest, scanWithMergeTriples) { ad_utility::deleteFile(permutationFilename); }; - // Now test for multiple block sizes (16 bytes is the minimum). - testWithGivenBlockSize(16); - testWithGivenBlockSize(32); - testWithGivenBlockSize(48); - testWithGivenBlockSize(64); - testWithGivenBlockSize(100'000); + // Now test for multiple block sizes (16 bytes is the minimum), relation + // sizes, and relations. + // + // TODO: Currently fails if `numRelations > 1`. 
+ size_t numRelations = 1; + for (size_t i = 1; i <= numRelations; ++i) { + testWithGivenBlockSize(16, numRelations, V(i)); + testWithGivenBlockSize(32, numRelations, V(i)); + testWithGivenBlockSize(48, numRelations, V(i)); + testWithGivenBlockSize(64, numRelations, V(i)); + testWithGivenBlockSize(100'000, numRelations, V(i)); + } } From 7a36a9d7533e789fe611b8a5debec06bee9a96d2 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Thu, 6 Apr 2023 14:58:09 +0200 Subject: [PATCH 14/20] Resolve conflict that git silently merged in the last commit --- src/index/CompressedRelation.h | 6 +----- test/DeltaTriplesTest.cpp | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 1a6360b532..0a57c41525 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -72,10 +72,6 @@ struct CompressedBlockMetadata { Id col1FirstId_; Id col1LastId_; - // For our `DeltaTriples` (https://github.com/ad-freiburg/qlever/pull/916), we - // need to know the least significant `Id` of the last triple as well. - Id col2LastId_; - // For `DeltaTriples::findTripleInPermutation`, it helps to know the least // significant ID of the last triple as well. // @@ -84,7 +80,7 @@ struct CompressedBlockMetadata { // above either. It doesn't really harm though because the total size of the // blocks is small (even for Wikidata, we have only 50K block, and as you can // see from the members, a block consumes < 100 bytes). - Id _col2LastId; + Id col2LastId_; // Two of these are equal if all members are equal. 
bool operator==(const CompressedBlockMetadata&) const = default; diff --git a/test/DeltaTriplesTest.cpp b/test/DeltaTriplesTest.cpp index 5e39c0ffde..3ae2cf17d1 100644 --- a/test/DeltaTriplesTest.cpp +++ b/test/DeltaTriplesTest.cpp @@ -344,7 +344,7 @@ TEST_F(DeltaTriplesTest, insertAndDeleteTriples) { } else if (blockIndex > 0) { return IdTriple{metadataPerBlock[blockIndex - 1].col0LastId_, metadataPerBlock[blockIndex - 1].col1LastId_, - metadataPerBlock[blockIndex - 1]._col2LastId}; + metadataPerBlock[blockIndex - 1].col2LastId_}; } else { return IdTriple{Id::makeUndefined(), Id::makeUndefined(), Id::makeUndefined()}; From 3afe571c8943c3b0532be0f239a87d0534fad73c Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sat, 8 Apr 2023 18:08:29 +0200 Subject: [PATCH 15/20] Address bug pointed out by SonarCloud I am not convinced though that it was a bug. --- src/index/CompressedRelation.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp index 5b457249b0..7f8b1c0a55 100644 --- a/src/index/CompressedRelation.cpp +++ b/src/index/CompressedRelation.cpp @@ -367,13 +367,13 @@ void CompressedRelationReader::scan( ad_utility::AllocatorWithLimit allocator{ ad_utility::makeAllocationMemoryLeftThreadsafeObject( std::numeric_limits::max())}; - IdTable result(1, allocator); - result.resize(block.size()); + std::unique_ptr result = std::make_unique(1, allocator); + result->resize(block.size()); for (size_t i = 0; i < block.size(); ++i) { - result(i, 0) = block(i, 1); + (*result)(i, 0) = block(i, 1); } - return {std::make_unique(std::move(result)), rowIndexBegin, - rowIndexEnd, blockIndex, numInsAndDel}; + return {std::move(result), rowIndexBegin, rowIndexEnd, blockIndex, + numInsAndDel}; }; // The first and the last block might be incomplete. 
We process them From 2f4b10528638c6f0ae8be873b24fe12b3ae486d2 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Tue, 11 Apr 2023 14:55:35 +0200 Subject: [PATCH 16/20] Revert "Address bug pointed out by SonarCloud" This reverts commit 3afe571c8943c3b0532be0f239a87d0534fad73c. --- src/index/CompressedRelation.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp index 7f8b1c0a55..5b457249b0 100644 --- a/src/index/CompressedRelation.cpp +++ b/src/index/CompressedRelation.cpp @@ -367,13 +367,13 @@ void CompressedRelationReader::scan( ad_utility::AllocatorWithLimit allocator{ ad_utility::makeAllocationMemoryLeftThreadsafeObject( std::numeric_limits::max())}; - std::unique_ptr result = std::make_unique(1, allocator); - result->resize(block.size()); + IdTable result(1, allocator); + result.resize(block.size()); for (size_t i = 0; i < block.size(); ++i) { - (*result)(i, 0) = block(i, 1); + result(i, 0) = block(i, 1); } - return {std::move(result), rowIndexBegin, rowIndexEnd, blockIndex, - numInsAndDel}; + return {std::make_unique(std::move(result)), rowIndexBegin, + rowIndexEnd, blockIndex, numInsAndDel}; }; // The first and the last block might be incomplete. We process them From ad1163cdb9ee38e1bb43178fa16e5a29b80731d9 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Thu, 8 Jun 2023 21:32:35 +0200 Subject: [PATCH 17/20] Make it compile again One test in `LocatedTriplesTest` still fails because writing the files for a PSO permutation and then reading from it no longer works as it did. I hope that Johannes or Julian can help me. 
--- src/index/CompressedRelation.h | 7 ++--- src/index/DeltaTriples.cpp | 16 ++++++----- src/index/DeltaTriples.h | 3 +- src/index/Index.h | 4 +-- src/index/IndexImpl.h | 12 ++++---- src/index/IndexMetaData.h | 3 +- src/index/LocatedTriples.cpp | 13 --------- src/index/LocatedTriples.h | 5 ++-- src/index/Permutations.h | 2 +- test/DeltaTriplesTest.cpp | 51 +++++++++++++++++----------------- test/LocatedTriplesTest.cpp | 45 +++++++++++++++--------------- 11 files changed, 75 insertions(+), 86 deletions(-) diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 76674bf84b..c2275ccc56 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -2,8 +2,7 @@ // Chair of Algorithms and Data Structures // Author: Johannes Kalmbach -#ifndef QLEVER_COMPRESSEDRELATION_H -#define QLEVER_COMPRESSEDRELATION_H +#pragma once #include #include @@ -11,7 +10,7 @@ #include "engine/idTable/IdTable.h" #include "global/Id.h" #include "index/ConstantsIndexBuilding.h" -#include "index/DeltaTriples.h" +#include "index/LocatedTriples.h" #include "util/BufferedVector.h" #include "util/Cache.h" #include "util/ConcurrentCache.h" @@ -328,5 +327,3 @@ class CompressedRelationReader { const CompressedBlockMetadata& blockMetaData, ad_utility::File& file, std::optional> columnIndices); }; - -#endif // QLEVER_COMPRESSEDRELATION_H diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 7d8c9a5783..35da7f88dd 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -139,19 +139,21 @@ void DeltaTriples::deleteTriple(TurtleTriple turtleTriple) { // ____________________________________________________________________________ const LocatedTriplesPerBlock& DeltaTriples::getTriplesWithPositionsPerBlock( - Index::Permutation permutation) const { + Permutation::Enum permutation) const { + // TODO: This `switch` would no longer be needed if the six + // locatedTriplesPerBlockIn... were a map with the permutation as key. 
switch (permutation) { - case Index::Permutation::PSO: + case Permutation::PSO: return locatedTriplesPerBlockInPSO_; - case Index::Permutation::POS: + case Permutation::POS: return locatedTriplesPerBlockInPOS_; - case Index::Permutation::SPO: + case Permutation::SPO: return locatedTriplesPerBlockInSPO_; - case Index::Permutation::SOP: + case Permutation::SOP: return locatedTriplesPerBlockInSOP_; - case Index::Permutation::OSP: + case Permutation::OSP: return locatedTriplesPerBlockInOSP_; - case Index::Permutation::OPS: + case Permutation::OPS: return locatedTriplesPerBlockInOPS_; default: AD_FAIL(); diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h index a06f16dd79..8c3f3e7b88 100644 --- a/src/index/DeltaTriples.h +++ b/src/index/DeltaTriples.h @@ -9,6 +9,7 @@ #include "index/Index.h" #include "index/IndexBuilderTypes.h" #include "index/LocatedTriples.h" +#include "index/Permutations.h" #include "parser/TurtleParser.h" #include "util/HashSet.h" @@ -88,7 +89,7 @@ class DeltaTriples { // Get `TripleWithPosition` objects for given permutation. const LocatedTriplesPerBlock& getTriplesWithPositionsPerBlock( - Index::Permutation permutation) const; + Permutation::Enum permutation) const; // TODO: made public as long as we are trying to figure out how this works. private: diff --git a/src/index/Index.h b/src/index/Index.h index dd96d91b7e..7a884492f2 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -3,6 +3,7 @@ // Author: // 2014-2017 Björn Buchhold (buchhold@informatik.uni-freiburg.de) // 2018- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de) + #pragma once #include @@ -48,9 +49,6 @@ class Index { std::unique_ptr pimpl_; public: - // Identifiers for the six possible permutations. - enum struct Permutation { PSO, POS, SPO, SOP, OPS, OSP }; - // Alongside the actual knowledge graph QLever stores additional triples // for optimized query processing. 
This struct is used to report various // statistics (number of triples, distinct number of subjects, etc.) for which diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index bc1122ea20..3c007f0440 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -169,32 +169,32 @@ class IndexImpl { "POS", ".pos", {1, 2, 0}, - deltaTriples_->getTriplesWithPositionsPerBlock(Index::Permutation::POS)}; + deltaTriples_->getTriplesWithPositionsPerBlock(Permutation::POS)}; Permutation pso_{ "PSO", ".pso", {1, 0, 2}, - deltaTriples_->getTriplesWithPositionsPerBlock(Index::Permutation::PSO)}; + deltaTriples_->getTriplesWithPositionsPerBlock(Permutation::PSO)}; Permutation sop_{ "SOP", ".sop", {0, 2, 1}, - deltaTriples_->getTriplesWithPositionsPerBlock(Index::Permutation::SOP)}; + deltaTriples_->getTriplesWithPositionsPerBlock(Permutation::SOP)}; Permutation spo_{ "SPO", ".spo", {0, 1, 2}, - deltaTriples_->getTriplesWithPositionsPerBlock(Index::Permutation::SPO)}; + deltaTriples_->getTriplesWithPositionsPerBlock(Permutation::SPO)}; Permutation ops_{ "OPS", ".ops", {2, 1, 0}, - deltaTriples_->getTriplesWithPositionsPerBlock(Index::Permutation::OPS)}; + deltaTriples_->getTriplesWithPositionsPerBlock(Permutation::OPS)}; Permutation osp_{ "OSP", ".osp", {2, 0, 1}, - deltaTriples_->getTriplesWithPositionsPerBlock(Index::Permutation::OSP)}; + deltaTriples_->getTriplesWithPositionsPerBlock(Permutation::OSP)}; public: IndexImpl(std::unique_ptr deltaTriples = diff --git a/src/index/IndexMetaData.h b/src/index/IndexMetaData.h index 9096ab110d..3039c0ba28 100644 --- a/src/index/IndexMetaData.h +++ b/src/index/IndexMetaData.h @@ -1,6 +1,7 @@ // Copyright 2015, University of Freiburg, // Chair of Algorithms and Data Structures. 
// Author: Björn Buchhold (buchhold@informatik.uni-freiburg.de) + #pragma once #include @@ -13,8 +14,8 @@ #include #include -#include "CompressedRelation.h" #include "global/Id.h" +#include "index/CompressedRelation.h" #include "index/MetaDataHandler.h" #include "util/File.h" #include "util/HashMap.h" diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index 08e20f1e50..377398ef94 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -198,7 +198,6 @@ size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, } // ____________________________________________________________________________ -template LocatedTriple LocatedTriple::locateTripleInPermutation( Id id1, Id id2, Id id3, const Permutation& permutation) { // Get the internal data structures from the permutation. @@ -342,15 +341,3 @@ std::ostream& operator<<(std::ostream& os, const LocatedTriplesPerBlock& ltpb) { } return os; } - -// Explicit instantiation for the six permutation. -#define INSTANTIATE_LTIP(Permutation) \ - template LocatedTriple \ - LocatedTriple::locateTripleInPermutation(Id, Id, Id, \ - const Permutation&); -INSTANTIATE_LTIP(Permutation::PSO_T) -INSTANTIATE_LTIP(Permutation::POS_T) -INSTANTIATE_LTIP(Permutation::SPO_T) -INSTANTIATE_LTIP(Permutation::SOP_T) -INSTANTIATE_LTIP(Permutation::OPS_T) -INSTANTIATE_LTIP(Permutation::OSP_T) diff --git a/src/index/LocatedTriples.h b/src/index/LocatedTriples.h index 426fc2bdc7..8a313621fe 100644 --- a/src/index/LocatedTriples.h +++ b/src/index/LocatedTriples.h @@ -2,11 +2,13 @@ // Chair of Algorithms and Data Structures // Authors: Hannah Bast +#pragma once + #include "engine/idTable/IdTable.h" #include "global/IdTriple.h" #include "util/HashMap.h" -#pragma once +class Permutation; // A triple and its location in a particular permutation. // @@ -30,7 +32,6 @@ struct LocatedTriple { bool existsInIndex; // Locate the given triple in the given permutation. 
- template static LocatedTriple locateTripleInPermutation( Id id1, Id id2, Id id3, const Permutation& permutation); diff --git a/src/index/Permutations.h b/src/index/Permutations.h index 0717ab7b58..e6319c2375 100644 --- a/src/index/Permutations.h +++ b/src/index/Permutations.h @@ -8,7 +8,7 @@ #include #include "global/Constants.h" -#include "index/DeltaTriples.h" +// #include "index/DeltaTriples.h" #include "index/IndexMetaData.h" #include "index/StxxlSortFunctors.h" #include "util/File.h" diff --git a/test/DeltaTriplesTest.cpp b/test/DeltaTriplesTest.cpp index 3ae2cf17d1..b91d8a6dc1 100644 --- a/test/DeltaTriplesTest.cpp +++ b/test/DeltaTriplesTest.cpp @@ -10,13 +10,14 @@ #include "engine/ExportQueryExecutionTrees.h" #include "index/DeltaTriples.h" #include "index/IndexImpl.h" +#include "index/Permutations.h" #include "parser/TurtleParser.h" // Shortcuts to these full type names used frequently in the following. // using IdTriple; -static const std::vector permutationEnums = { - Index::Permutation::PSO, Index::Permutation::POS, Index::Permutation::SPO, - Index::Permutation::SOP, Index::Permutation::OPS, Index::Permutation::OSP}; +static const std::vector permutationEnums = { + Permutation::PSO, Permutation::POS, Permutation::SPO, + Permutation::SOP, Permutation::OPS, Permutation::OSP}; // Fixture that sets up a test index. class DeltaTriplesTest : public ::testing::Test { @@ -102,7 +103,7 @@ class DeltaTriplesTest : public ::testing::Test { // number of `LocatedTriple` objects. void checkTriplesWithPositionsPerBlockSize(const DeltaTriples& deltaTriples, size_t expectedSize) { - for (Index::Permutation permutation : permutationEnums) { + for (Permutation::Enum permutation : permutationEnums) { ASSERT_EQ(deltaTriples.getTriplesWithPositionsPerBlock(permutation) .numTriples(), expectedSize); @@ -380,27 +381,27 @@ TEST_F(DeltaTriplesTest, insertAndDeleteTriples) { // Check that all `locatedTriple`s are correct (for all // permutations). the given permutation. 
- auto checkAllTriplesWithPositionForAllPermutations = [&](const DeltaTriples& - deltaTriples) { - checkAllTriplesWithPositionsForPermutation( - deltaTriples.getTriplesWithPositionsPerBlock(Index::Permutation::POS), - index.getImpl().POS()); - checkAllTriplesWithPositionsForPermutation( - deltaTriples.getTriplesWithPositionsPerBlock(Index::Permutation::PSO), - index.getImpl().PSO()); - checkAllTriplesWithPositionsForPermutation( - deltaTriples.getTriplesWithPositionsPerBlock(Index::Permutation::SPO), - index.getImpl().SPO()); - checkAllTriplesWithPositionsForPermutation( - deltaTriples.getTriplesWithPositionsPerBlock(Index::Permutation::SOP), - index.getImpl().SOP()); - checkAllTriplesWithPositionsForPermutation( - deltaTriples.getTriplesWithPositionsPerBlock(Index::Permutation::OPS), - index.getImpl().OPS()); - checkAllTriplesWithPositionsForPermutation( - deltaTriples.getTriplesWithPositionsPerBlock(Index::Permutation::OSP), - index.getImpl().OSP()); - }; + auto checkAllTriplesWithPositionForAllPermutations = + [&](const DeltaTriples& deltaTriples) { + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Permutation::POS), + index.getImpl().POS()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Permutation::PSO), + index.getImpl().PSO()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Permutation::SPO), + index.getImpl().SPO()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Permutation::SOP), + index.getImpl().SOP()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Permutation::OPS), + index.getImpl().OPS()); + checkAllTriplesWithPositionsForPermutation( + deltaTriples.getTriplesWithPositionsPerBlock(Permutation::OSP), + index.getImpl().OSP()); + }; // Check if each existing triple is located correctly in every // permutation. 
diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp index 7cd2bfbd54..4dbb24e8a6 100644 --- a/test/LocatedTriplesTest.cpp +++ b/test/LocatedTriplesTest.cpp @@ -212,34 +212,35 @@ TEST_F(LocatedTriplesTest, scanWithMergeTriples) { {30, 20}, // Row 4 {30, 30}}); // Row 5 - // Write it to disk (adapted from `CompressedRelationsTest`). The last value - // of the call to `addRelation` is the number of distinct elements. + // Write the permutation to disk (adapted from `CompressedRelationsTest`, + // `IndexImpl::createPermutationPairImpl`, and `IndexImpl::). ad_utility::File permutationFileForWritingRelations{permutationFilename, "w"}; - CompressedRelationWriter writer{ - std::move(permutationFileForWritingRelations), blockSizeInBytes}; - for (size_t i = 1; i <= numRelations; ++i) { - writer.addRelation(V(i), getBufferedIdTable(relation), relation.size()); + IndexMetaDataMmap metadataMmap; + metadataMmap.setup(permutationFilename + MMAP_FILE_SUFFIX, + ad_utility::CreateTag{}); + { + CompressedRelationWriter writer{ + std::move(permutationFileForWritingRelations), blockSizeInBytes}; + for (size_t i = 1; i <= numRelations; ++i) { + // The third argument is the number of distinct elements. + auto relationMetadata = writer.addRelation( + V(i), getBufferedIdTable(relation), relation.size()); + metadataMmap.add(relationMetadata); + } + metadataMmap.blockData() = std::move(writer).getFinishedBlocks(); + } + std::cout << "Metadata statistics: " << metadataMmap.statistics() + << std::endl; + { + ad_utility::File permutationFileForWritingMetadata(permutationFilename, + "r+"); + metadataMmap.appendToFile(&permutationFileForWritingMetadata); } - writer.finish(); - auto metadataPerRelation = writer.getFinishedMetaData(); - auto metadataPerBlock = writer.getFinishedBlocks(); - AD_CORRECTNESS_CHECK(metadataPerRelation.size() == numRelations); - - // Append the metadata to the index file. 
- IndexMetaDataHmap metadata; - std::ranges::for_each(metadataPerRelation, - [&metadata](auto& md) { metadata.add(md); }); - metadata.blockData() = metadataPerBlock; - ad_utility::File permutationFileForWritingMetadata{permutationFilename, - "r+"}; - metadata.appendToFile(&permutationFileForWritingMetadata); - permutationFileForWritingMetadata.close(); // Create a permutation based on this. LocatedTriplesPerBlock locatedTriplesPerBlock; - Permutation::PermutationImpl permutation{ - SortByPSO(), "PSO", ".pso", {1, 0, 2}, locatedTriplesPerBlock}; + Permutation permutation{"PSO", ".pso", {1, 0, 2}, locatedTriplesPerBlock}; permutation.loadFromDisk(basename); // ad_utility::File permutationFileForReading{permutationFilename, "r"}; // permutation._file = std::move(permutationFileForReading); From b8817173c0c4dfe323b38bb59c9c838551afc94d Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Fri, 9 Jun 2023 12:00:35 +0200 Subject: [PATCH 18/20] Fix test with help of Johannes --- test/LocatedTriplesTest.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp index 4dbb24e8a6..15fe0fb089 100644 --- a/test/LocatedTriplesTest.cpp +++ b/test/LocatedTriplesTest.cpp @@ -214,12 +214,12 @@ TEST_F(LocatedTriplesTest, scanWithMergeTriples) { // Write the permutation to disk (adapted from `CompressedRelationsTest`, // `IndexImpl::createPermutationPairImpl`, and `IndexImpl::). 
- ad_utility::File permutationFileForWritingRelations{permutationFilename, - "w"}; - IndexMetaDataMmap metadataMmap; - metadataMmap.setup(permutationFilename + MMAP_FILE_SUFFIX, - ad_utility::CreateTag{}); { + ad_utility::File permutationFileForWritingRelations{permutationFilename, + "w"}; + IndexMetaDataMmap metadataMmap; + metadataMmap.setup(permutationFilename + MMAP_FILE_SUFFIX, + ad_utility::CreateTag{}); CompressedRelationWriter writer{ std::move(permutationFileForWritingRelations), blockSizeInBytes}; for (size_t i = 1; i <= numRelations; ++i) { @@ -229,10 +229,6 @@ TEST_F(LocatedTriplesTest, scanWithMergeTriples) { metadataMmap.add(relationMetadata); } metadataMmap.blockData() = std::move(writer).getFinishedBlocks(); - } - std::cout << "Metadata statistics: " << metadataMmap.statistics() - << std::endl; - { ad_utility::File permutationFileForWritingMetadata(permutationFilename, "r+"); metadataMmap.appendToFile(&permutationFileForWritingMetadata); From e88c29303fa9391d14f482a3e9e8e6fe453b5563 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Fri, 9 Jun 2023 12:06:51 +0200 Subject: [PATCH 19/20] Latest submodules --- third_party/abseil-cpp | 2 +- third_party/stxxl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/abseil-cpp b/third_party/abseil-cpp index 13708db87b..abe63eb9bd 160000 --- a/third_party/abseil-cpp +++ b/third_party/abseil-cpp @@ -1 +1 @@ -Subproject commit 13708db87b1ab69f4f2b3214f3f51e986546f282 +Subproject commit abe63eb9bd1213c018bf82765ab747334d3b33d8 diff --git a/third_party/stxxl b/third_party/stxxl index e8025eb4ed..3a56499dbb 160000 --- a/third_party/stxxl +++ b/third_party/stxxl @@ -1 +1 @@ -Subproject commit e8025eb4ede8c033bf64183c3e104e4cb0617271 +Subproject commit 3a56499dbbd1ce124546626f18e61ca0520df0c3 From 8910f61335bec7ffff8095141d0d982beff56c5f Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Fri, 9 Jun 2023 16:17:54 +0200 Subject: [PATCH 20/20] Some clean up before splitting off a first smaller PR 
--- src/index/DeltaTriples.h | 31 +--- src/index/LocatedTriples.cpp | 282 ++++++++++++++++++----------------- src/index/LocatedTriples.h | 118 ++++++++++----- test/LocatedTriplesTest.cpp | 2 +- 4 files changed, 232 insertions(+), 201 deletions(-) diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h index 8c3f3e7b88..715a42d0d8 100644 --- a/src/index/DeltaTriples.h +++ b/src/index/DeltaTriples.h @@ -17,7 +17,7 @@ // building, we call these delta triples. How it works in principle: // // 1. For each delta triple, find the location in each permutation (block index -// and index within that block, see end of the file for an exact definition). +// and index within that block, see `index/LocatedTriples.h`). // // 2. For each permutation and each block, store a sorted list of the positions // of the delta triples within that block. @@ -25,6 +25,8 @@ // 3. In the call of `PermutationImpl::scan`, use the respective lists to merge // the relevant delta tripless into the index scan result. // +// NOTE: The delta triples currently do not go well together with CACHING. See +// the discussion at the end of this file. class DeltaTriples { private: // The index to which these triples are added. @@ -121,9 +123,7 @@ class DeltaTriples { void eraseTripleInAllPermutations(LocatedTripleHandles& handles); }; -// More detailed discussion and information about the `DeltaTriples` class. -// -// A. DELTA TRIPLES AND THE CACHE +// DELTA TRIPLES AND THE CACHE // // For now, our approach only works when the results of index scans are not // cached (unless there are no relevant delta triples for a particular scan). @@ -140,26 +140,3 @@ class DeltaTriples { // store and maintain the positions in those uncompressed index scans. However, // this would only work for the results of index scans. For the results of more // complex subqueries, it's hard to figure out which delta triples are relevant. -// -// B. DEFINITION OF THE POSITION OF A DELTA TRIPLE IN A PERMUTATION -// -// 1. 
The position is defined by the index of a block in the permutation and the -// index of a row within that block. -// -// 2. If the triple in contained in the permutation, it is contained exactly -// once and so there is a well defined block and position in that block. -// -// 2. If there is a block, where the first triple is smaller and the last triple -// is larger, then that is the block and the position in that block is that of -// the first triple that is (not smaller and hence) larger. -// -// 3. If the triple falls "between two blocks" (the last triple of the previous -// block is smaller and the first triple of the next block is larger), then the -// position is the first position in that next block. -// -// 4. As a special case of 3., if the triple is smaller than all triples in the -// permutation, the position is the first position of the first block. -// -// 5. If the triple is larger than all triples in the permutation, the block -// index is one after the largest block index and the position within that -// non-existing block is arbitrary. diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index 377398ef94..acd6988675 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -10,27 +10,139 @@ #include "index/IndexMetaData.h" #include "index/Permutations.h" +// ____________________________________________________________________________ +LocatedTriple LocatedTriple::locateTripleInPermutation( + Id id1, Id id2, Id id3, const Permutation& permutation) { + // Get the internal data structures from the permutation. + auto& file = permutation._file; + const auto& meta = permutation._meta; + const auto& reader = permutation._reader; + + // Find the index of the first block where the last triple is not smaller. + // + // NOTE: Since `_col2LastId` has been added to `CompressedBlockMetadata`, this + // can be computed without having to decompress any blocks. 
+ const vector& blocks = meta.blockData(); + auto matchingBlock = std::lower_bound( + blocks.begin(), blocks.end(), std::array{id1, id2, id3}, + [&](const CompressedBlockMetadata& block, const auto& triple) -> bool { + if (block.col0LastId_ < triple[0]) { + return true; + } else if (block.col0LastId_ == triple[0]) { + if (block.col1LastId_ < triple[1]) { + return true; + } else if (block.col1LastId_ == triple[1]) { + return block.col2LastId_ < triple[2]; + } + } + return false; + }); + size_t blockIndex = matchingBlock - blocks.begin(); + + // Preliminary `LocatedTriple` object with the correct `blockIndex` and + // `Id`s, and a special `rowIndexInBlock` (see below) and `existsInIndex` set + // to `false`. + LocatedTriple locatedTriple{blockIndex, NO_ROW_INDEX, id1, id2, id3, false}; + + // If all `Id`s from all blocks are smaller, we return the index of the last + // block plus one (typical "end" semantics) and the special row index + // `NO_ROW_INDEX` (see how this is considered in `mergeTriples`). + if (matchingBlock == blocks.end()) { + AD_CORRECTNESS_CHECK(blockIndex == blocks.size()); + return locatedTriple; + } + + // Read and decompress the block. + DecompressedBlock blockTuples = + reader.readAndDecompressBlock(*matchingBlock, file, std::nullopt); + + // Find the smallest relation `Id` that is not smaller than `id1` and get its + // metadata and the position of the first and last triple with that `Id` in + // the block. + // + // IMPORTANT: If relation `id1` exists in the index, but our triple is larger + // than all triples of that relation in the index and the last triple of that + // relation ends a block, then our block search above (correctly) landed us at + // the next block. We can detect this by checking whether the first relation + // `Id` of the block is larger than `id1` and then we should get the metadata + // for that `Id` and not for `id1` (which would pertain to a previous block).
+ // + // TODO: There is still a bug in `MetaDataWrapperHashMap::lower_bound`, + // which is relevant in the rare case where a triple is inserted with an + // `Id` for predicate that is not a new `Id`, but has not been used for a + // predicate in the original index. + // + // NOTE: Since we have already handled the case, where all `Id`s in the + // permutation are smaller, above, such a relation should exist. + Id searchId = + matchingBlock->col0FirstId_ > id1 ? matchingBlock->col0FirstId_ : id1; + const auto& it = meta._data.lower_bound(searchId); + AD_CORRECTNESS_CHECK(it != meta._data.end()); + Id id = it.getId(); + const auto& relationMetadata = meta.getMetaData(id); + size_t offsetBegin = relationMetadata.offsetInBlock_; + size_t offsetEnd = offsetBegin + relationMetadata.numRows_; + // Note: If the relation spans multiple blocks, we know that the block we + // found above contains only triples from that relation. + if (offsetBegin == std::numeric_limits::max()) { + offsetBegin = 0; + offsetEnd = blockTuples.size(); + } + AD_CORRECTNESS_CHECK(offsetBegin <= blockTuples.size()); + AD_CORRECTNESS_CHECK(offsetEnd <= blockTuples.size()); + + // If we have found `id1`, we can do a binary search in the portion of the + // block that pertains to it (note the special case mentioned above, where + // we are already at the beginning of the next block). + // + // Otherwise, `id` is the next larger `Id` and the position of the first + // triple of that relation is exactly the position we are looking for. + if (id == id1) { + locatedTriple.rowIndexInBlock = + std::lower_bound(blockTuples.begin() + offsetBegin, + blockTuples.begin() + offsetEnd, + std::array{id2, id3}, + [](const auto& a, const auto& b) { + return a[0] < b[0] || (a[0] == b[0] && a[1] < b[1]); + }) - + blockTuples.begin(); + // Check if the triple at the found position is equal to `id1 id2 id3`. + // Note that our default for `existsInIndex` was set to `false` above. 
+ const size_t& i = locatedTriple.rowIndexInBlock; + AD_CORRECTNESS_CHECK(i < blockTuples.size()); + if (i < offsetEnd && blockTuples(i, 0) == id2 && blockTuples(i, 1) == id3) { + locatedTriple.existsInIndex = true; + } + } else { + AD_CORRECTNESS_CHECK(id1 < id); + locatedTriple.rowIndexInBlock = offsetBegin; + } + + // Return the result. + return locatedTriple; +} + // ____________________________________________________________________________ template std::pair LocatedTriplesPerBlock::numTriplesImpl( size_t blockIndex, Id id1, Id id2) const { - // If no located triples for `blockIndex` exist, there are no delta triples - // for that block. + // If no located triples for `blockIndex` exist, there is no entry in `map_`. if (!map_.contains(blockIndex)) { return {0, 0}; } - // Otherwise iterate over all entries and count. - size_t countInserted = 0; - size_t countDeleted = 0; + // Otherwise iterate over all located triples and count how many of them exist + // in the index ("to be deleted") and how many are new ("to be inserted"). + size_t countExists = 0; + size_t countNew = 0; for (const LocatedTriple& locatedTriple : map_.at(blockIndex)) { // Helper lambda for increasing the right counter. auto increaseCountIf = [&](bool increase) { if (increase) { if (locatedTriple.existsInIndex) { - ++countDeleted; + ++countExists; } else { - ++countInserted; + ++countNew; } } }; @@ -43,7 +155,7 @@ std::pair LocatedTriplesPerBlock::numTriplesImpl( increaseCountIf(locatedTriple.id1 == id1 && locatedTriple.id2 == id2); } } - return {countInserted, countDeleted}; + return {countNew, countExists}; } // ____________________________________________________________________________ @@ -73,22 +185,33 @@ size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, size_t offsetInResult, Id id1, Id id2, size_t rowIndexInBlockBegin, size_t rowIndexInBlockEnd) const { - // This method should only be called, if located triples in that block exist. 
- // The two `rowIndexInBlock`s should define a valid non-empty range. Both - // `block` and `result` must have one column for `MatchMode::MatchId1AndId2` - // and two columns otherwise. If `block` is `std::nullopt`, we are in a - // special case were only delta triples are inserted (which is only used for - // `matchMode`s other than `MatchAll`). + // This method should only be called if there are located triples in the + // specified block. + AD_CONTRACT_CHECK(map_.contains(blockIndex)); + + // The special case `block == std::nullopt` (write only located triples to + // `result`) is only allowed, when `id1` or `id1` and `id2` are specified. AD_CONTRACT_CHECK(block.has_value() || matchMode != MatchMode::MatchAll); + + // If `rowIndexInBlockEnd` has the default value (see `LocatedTriples.h`), the + // intended semantics is that we read the whole block (note that we can't have + // a default value that depends on the values of previous arguments). if (rowIndexInBlockEnd == LocatedTriple::NO_ROW_INDEX && block.has_value()) { rowIndexInBlockEnd = block.value().size(); } - AD_CONTRACT_CHECK(map_.contains(blockIndex)); + + // Check that `rowIndexInBlockBegin` and `rowIndexInBlockEnd` define a valid + // and non-empty range and that it is a subrange of `block` (unless the latter + // is `std::nullopt`). if (block.has_value()) { AD_CONTRACT_CHECK(rowIndexInBlockBegin < block.value().size()); AD_CONTRACT_CHECK(rowIndexInBlockEnd <= block.value().size()); } AD_CONTRACT_CHECK(rowIndexInBlockBegin < rowIndexInBlockEnd); + + // If we restrict `id1` and `id2`, the index block and the result must have + // one column (for the `id3`). Otherwise, they must have two columns (for the + // `id2` and the `id3`).
if constexpr (matchMode == MatchMode::MatchId1AndId2) { AD_CONTRACT_CHECK(!block.has_value() || block.value().numColumns() == 1); AD_CONTRACT_CHECK(result.numColumns() == 1); @@ -113,16 +236,16 @@ size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, } }; - // Skip located triples that come before `offsetOfBlock` because this may be a - // partial block. + // Advance to the first located triple in the specified range. while (locatedTriple != locatedTriples.end() && locatedTriple->rowIndexInBlock < rowIndexInBlockBegin) { ++locatedTriple; } - // Iterate over the specified part of `block`. In the special case where - // `block` is `std::nullopt`, just insert the delta triples at the beginning, - // which all have `NO_ROW_INDEX`. + // Iterate over all located triples in the specified range. In the special + // case `block == std::nullopt` (only write located triples to `result`), all + // relevant located triples have `rowIndexInBlock == NO_ROW_INDEX` (here we + // need that `NO_ROW_INDEX` is the maximal `size_t` value minus one). if (!block.has_value()) { rowIndexInBlockBegin = LocatedTriple::NO_ROW_INDEX; rowIndexInBlockEnd = rowIndexInBlockBegin + 1; @@ -148,7 +271,7 @@ size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, } // Append the triple at this position to the result if and only if it is not - // marked for deletion and matches (also skip it if it doesn't match). + // marked for deletion and matches (also skip it if it does not match). bool deleteThisEntry = false; if (locatedTriple != locatedTriples.end() && locatedTriple->rowIndexInBlock == rowIndex && @@ -197,123 +320,6 @@ size_t LocatedTriplesPerBlock::mergeTriples(size_t blockIndex, rowIndexInBlockBegin, rowIndexInBlockEnd); } -// ____________________________________________________________________________ -LocatedTriple LocatedTriple::locateTripleInPermutation( - Id id1, Id id2, Id id3, const Permutation& permutation) { - // Get the internal data structures from the permutation. 
- auto& file = permutation._file; - const auto& meta = permutation._meta; - const auto& reader = permutation._reader; - - // Find the index of the first block where the last triple is not smaller. - // - // NOTE: With `_col2LastId` added to `CompressedBlockMetadata`, this can - // now be computed without having to decompress any blocks at this point. - // See the first revision of this branch for code, where blocks with equal - // `id1` and `id2` were decompressed to also check for `id3`. - const vector& blocks = meta.blockData(); - auto matchingBlock = std::lower_bound( - blocks.begin(), blocks.end(), std::array{id1, id2, id3}, - [&](const CompressedBlockMetadata& block, const auto& triple) -> bool { - if (block.col0LastId_ < triple[0]) { - return true; - } else if (block.col0LastId_ == triple[0]) { - if (block.col1LastId_ < triple[1]) { - return true; - } else if (block.col1LastId_ == triple[1]) { - return block.col2LastId_ < triple[2]; - } - } - return false; - }); - size_t blockIndex = matchingBlock - blocks.begin(); - - // Preliminary `FindTripleResult` object with the correct `blockIndex` and - // `Id`s, and a special `rowIndexInBlock` (see below) and `existsInIndex` set - // to `false`. - LocatedTriple locatedTriple{blockIndex, NO_ROW_INDEX, id1, id2, id3, false}; - - // If all `Id`s from all blocks are smaller, we return the index of the last - // block plus one (typical "end" semantics) and `NO_ROW_INDEX` (see above and - // how this is considered in `mergeTriples`). - if (matchingBlock == blocks.end()) { - AD_CORRECTNESS_CHECK(blockIndex == blocks.size()); - return locatedTriple; - } - - // Read and decompress the block. Note that we are potentially doing this - // a second time here (the block has probably already been looked at in - // the call to `std::lower_bound` above). 
- DecompressedBlock blockTuples = - reader.readAndDecompressBlock(*matchingBlock, file, std::nullopt); - - // Find the smallest "relation" ID that is not smaller than `id1` and get - // its metadata and the position of the first and last triple with that ID - // in the block. - // - // IMPORTANT FIX: If relation `id1` exists in the index, but our triple is - // larger than all triples of that relation in the index and the last - // triple of that relation ends a block, then our block search above - // (correctly) landed us at the next block. We can detect this by checking - // whether the first relation ID of the block is larger than `id1` and - // then we should get the metadata for the ID and not for `id1` (which - // would pertain to a previous block). - // - // TODO: There is still a bug in `MetaDataWrapperHashMap::lower_bound`, - // which is relevant in the rare case where a triple is inserted with an - // `Id` for predicate that is not a new `Id`, but has not been used for a - // predicate in the original index. - // - // NOTE: Since we have already handled the case, where all IDs in the - // permutation are smaller, above, such a relation should exist. - Id searchId = - matchingBlock->col0FirstId_ > id1 ? matchingBlock->col0FirstId_ : id1; - const auto& it = meta._data.lower_bound(searchId); - AD_CORRECTNESS_CHECK(it != meta._data.end()); - Id id = it.getId(); - const auto& relationMetadata = meta.getMetaData(id); - size_t offsetBegin = relationMetadata.offsetInBlock_; - size_t offsetEnd = offsetBegin + relationMetadata.numRows_; - // Note: If the relation spans multiple blocks, we know that the block we - // found above contains only triples from that relation. 
- if (offsetBegin == std::numeric_limits::max()) { - offsetBegin = 0; - offsetEnd = blockTuples.size(); - } - AD_CORRECTNESS_CHECK(offsetBegin <= blockTuples.size()); - AD_CORRECTNESS_CHECK(offsetEnd <= blockTuples.size()); - - // If we have found `id1`, we can do a binary search in the portion of the - // block that pertains to it (note the special case mentioned above, where - // we are already at the beginning of the next block). - // - // Otherwise, `id` is the next larger ID and the position of the first - // triple of that relation is exactly the position we are looking for. - if (id == id1) { - locatedTriple.rowIndexInBlock = - std::lower_bound(blockTuples.begin() + offsetBegin, - blockTuples.begin() + offsetEnd, - std::array{id2, id3}, - [](const auto& a, const auto& b) { - return a[0] < b[0] || (a[0] == b[0] && a[1] < b[1]); - }) - - blockTuples.begin(); - // Check if the triple at the found position is equal to `id1 id2 id3`. - // Note that our default for `existsInIndex` was set to `false` above. - const size_t& i = locatedTriple.rowIndexInBlock; - AD_CORRECTNESS_CHECK(i < blockTuples.size()); - if (i < offsetEnd && blockTuples(i, 0) == id2 && blockTuples(i, 1) == id3) { - locatedTriple.existsInIndex = true; - } - } else { - AD_CORRECTNESS_CHECK(id1 < id); - locatedTriple.rowIndexInBlock = offsetBegin; - } - - // Return the result. - return locatedTriple; -} - // ____________________________________________________________________________ std::ostream& operator<<(std::ostream& os, const LocatedTriple& lt) { os << "LT(" << lt.blockIndex << " " diff --git a/src/index/LocatedTriples.h b/src/index/LocatedTriples.h index 8a313621fe..bb967bfe95 100644 --- a/src/index/LocatedTriples.h +++ b/src/index/LocatedTriples.h @@ -12,13 +12,18 @@ class Permutation; // A triple and its location in a particular permutation. 
// +// If a triple is not contained in the permutation, the location is the location +// of the next larger triple (which may be in the next block or beyond the last +// block). For a detailed definition of all border cases, see the definition at +// the end of this file. +// // NOTE: Technically, `blockIndex` and the `existsInIndex` are redundant in this -// record because they can be derived when the clas is used. However, both are -// useful for testing and for a small nuber of delta triples (think millions), -// the space efficiency of this class is not a significant issue. +// record because they can be derived when the class is used. However, they are +// useful for testing, and for a small number of delta triples (think millions), +// space efficiency is not a significant issue for this class. struct LocatedTriple { - // The index of the block and the position within that block, where the - // triple "fits". + // The index of the block and the location within that block, according to the + // definition above. size_t blockIndex; size_t rowIndexInBlock; // The `Id`s of the triple in the order of the permutation. For example, @@ -27,21 +32,28 @@ struct LocatedTriple { Id id1; Id id2; Id id3; - // True iff the triple exists in the permutation (then it is equal to the - // triple at the position given by `blockIndex` and `rowIndexInBlock`). + // Flag that is true if and only if the triple exists in the permutation. It + // is then equal to the triple at the position given by `blockIndex` and + // `rowIndexInBlock`. bool existsInIndex; // Locate the given triple in the given permutation. static LocatedTriple locateTripleInPermutation( Id id1, Id id2, Id id3, const Permutation& permutation); - // Special row index for triples that belong to previous block. It is - // important that this value plus one is actually greater.
+ // Special row index for triples that belong to the previous block (see the + // definition for the location of a triple at the end of this file). + // + // NOTE: It is important that `NO_ROW_INDEX + 1 > NO_ROW_INDEX`, hence it is + // defined as `max() - 1` and not as the seemingly more natural `max()`. static const size_t NO_ROW_INDEX = std::numeric_limits::max() - 1; }; -// A sorted set of triples located at the same position in a particular -// permutation. Note that we could also overload `std::less` here. +// A sorted set of located triples. In `LocatedTriplesPerBlock` below, we use +// this to store all located triples with the same `blockIndex`. +// +// NOTE: We could also overload `std::less` here, but the explicit specification +// of the order makes it clearer. struct LocatedTripleCompare { bool operator()(const LocatedTriple& x, const LocatedTriple& y) const { return IdTriple{x.id1, x.id2, x.id3} < IdTriple{y.id1, y.id2, y.id3}; @@ -49,43 +61,53 @@ struct LocatedTripleCompare { }; using LocatedTriples = std::set; -// A sorted set of triples located in particular permutation, grouped by block. +// Sorted sets of located triples, grouped by block. We use this to store all +// located triples for a permutation. class LocatedTriplesPerBlock { private: // The total number of `LocatedTriple` objects stored (for all blocks). size_t numTriples_ = 0; public: - // Map with the list of triples per block. + // For each block with a non-empty set of located triples, the located triples + // in that block. // - // TODO: Should be private. Should we make `LocatedTriplesPerBlock` a subclass - // of `HashMap` or is that bad style? + // NOTE: This is currently not private because we want access to + // `map_.size()`, `map_.clear()`, `map_.contains(...)`, and `map_.at(...)`. + // We could also make `LocatedTriplesPerBlock` a subclass of `HashMap`, but not sure whether that is good style. 
ad_utility::HashMap map_; public: - // Get the number of to-be-inserted (first) and to-be-deleted (second) triples - // for the given block and that match the `id1` (if provided) and `id2` (if - // provided). + // Get the number of located triples for the given block that match `id1` (if + // provided) and `id2` (if provided). The return value is a pair of numbers: + // first, the number of existing triples ("to be deleted") and second, the + // number of new triples ("to be inserted"). std::pair numTriples(size_t blockIndex) const; std::pair numTriples(size_t blockIndex, Id id1) const; std::pair numTriples(size_t blockIndex, Id id1, Id id2) const; - // Merge the located triples for `blockIndex` into the given `blockPart` and - // write the result to `result`, starting from position `offsetInResult`. If - // `blockPart` is a whole index block, `offsetInBlock` is zero, otherwise it's - // the offset in the full block, where the part starts. + // Merge located triples for `blockIndex` with the given index `block` and + // write to `result`, starting from position `offsetInResult`. Consider only + // located triples in the range specified by `rowIndexInBlockBegin` and + // `rowIndexInBlockEnd`. Consider only triples that match `id1` (if provided) + // and `id2` (if provided). Return the number of rows written to `result`. + // + // PRECONDITIONS: // - // It is the resposibility of the caller that there is enough space for the - // result starting from that offset. Like for `numTriplesInBlock` above, - // consider only triples that match `id1` (if provided) and `id2` (if - // provided). + // 1. The set of located triples for `blockIndex` must be non-empty. + // Otherwise, there is no need for merging and this method shouldn't be + // called for efficiency reasons. // - // In the special case where `block == std::nullopt`, we are just inserting - // the located triples for block `blockIndex` where the `rowIndexInBlock` is + // 2. 
It is the responsibility of the caller that there is enough space for the + // result of the merge in `result` starting from `offsetInResult`. + // + // 3. If `block == std::nullopt`, we are adding to `result` the located + // triples for block `blockIndex` where the `rowIndexInBlock` is // `NO_ROW_INDEX`. These actually belong to the previous block, but were - // larger than all triples there. + // larger than all triples there. This requires that `id1` or both `id1` and + // `id2` are specified. // - // Returns the number of rows written to `result`. size_t mergeTriples(size_t blockIndex, std::optional block, IdTable& result, size_t offsetInResult) const; size_t mergeTriples(size_t blockIndex, std::optional block, @@ -97,8 +119,11 @@ class LocatedTriplesPerBlock { size_t rowIndexInBlockEnd = LocatedTriple::NO_ROW_INDEX) const; // Add the given `locatedTriple` to the given `LocatedTriplesPerBlock`. - // Returns a handle to where it was added (via which we can easily remove it - // again if we need to). + // Return a handle to where it was added (`LocatedTriples` is a sorted set, + // see above). We need this handle so that we can easily remove the + // `locatedTriple` again from the set in case we need to. + // + // The `locatedTriple` must not already exist in `LocatedTriplesPerBlock`. LocatedTriples::iterator add(const LocatedTriple& locatedTriple) { LocatedTriples& locatedTriples = map_[locatedTriple.blockIndex]; auto [handle, wasInserted] = locatedTriples.emplace(locatedTriple); @@ -111,10 +136,10 @@ class LocatedTriplesPerBlock { // Get the total number of `LocatedTriple` objects (for all blocks). size_t numTriples() const { return numTriples_; } - // Get the number of blocks containing `LocatedTriple` objects. + // Get the number of blocks with a non-empty set of located triples. size_t numBlocks() const { return map_.size(); } - // Empty the data structure. + // Remove all located triples.
void clear() { map_.clear(); numTriples_ = 0; @@ -142,7 +167,30 @@ class LocatedTriplesPerBlock { }; // Human-readable representation of `LocatedTriple`, `LocatedTriples`, and -// `LocatedTriplesPerBlock` that are very useful for debugging. +// `LocatedTriplesPerBlock`, which are very useful for debugging. std::ostream& operator<<(std::ostream& os, const LocatedTriple& lt); std::ostream& operator<<(std::ostream& os, const LocatedTriples& lts); std::ostream& operator<<(std::ostream& os, const LocatedTriplesPerBlock& ltpb); + +// DEFINITION OF THE POSITION OF A LOCATED TRIPLE IN A PERMUTATION +// +// 1. The position is defined by the index of a block in the permutation and the +// index of a row within that block. +// +// 2. If the triple is contained in the permutation, it is contained exactly +// once and so there is a well-defined block and position in that block. +// +// 3. If there is a block, where the first triple is smaller and the last triple +// is larger, then that is the block and the position in that block is that of +// the first triple that is (not smaller and hence) larger. +// +// 4. If the triple falls "between two blocks" (the last triple of the previous +// block is smaller and the first triple of the next block is larger), then the +// position is the first position in that next block. +// +// 5. As a special case of 4, if the triple is smaller than all triples in the +// permutation, the position is the first position of the first block. +// +// 6. If the triple is larger than all triples in the permutation, the block +// index is one after the largest block index and the position within that +// non-existing block is arbitrary. diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp index 15fe0fb089..822e7c4b0b 100644 --- a/test/LocatedTriplesTest.cpp +++ b/test/LocatedTriplesTest.cpp @@ -16,7 +16,7 @@ namespace { auto V = ad_utility::testing::VocabId; } -// Fixture that ... TODO:explain. +// Fixture with helper functions.
class LocatedTriplesTest : public ::testing::Test { protected: // Make `LocatedTriplesPerBlock` from a list of `LocatedTriple` objects (the