From 2bbdc2b0fe79b6f502c63202f1de1e84cf6038ec Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sun, 19 Mar 2023 20:47:11 +0100 Subject: [PATCH 1/7] Smaller block size and add _col2LastId to block metadata 1. The block size used to be `1 << 23` (over 8M), which is too large, since we always need to decompress at least one whole block, even when reading only a few triples. It's now 100'000, which still has a relatively small overall space consumption. 2. Add member `_col2LastId` to block data because we need it for the delta triples (https://github.com/ad-freiburg/qlever/pull/916). --- src/index/CompressedRelation.cpp | 4 +++- src/index/CompressedRelation.h | 12 ++++++++++++ src/index/ConstantsIndexBuilding.h | 10 +++++++--- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp index 4dee23172f..a03720af10 100644 --- a/src/index/CompressedRelation.cpp +++ b/src/index/CompressedRelation.cpp @@ -367,6 +367,7 @@ void CompressedRelationWriter::addRelation(Id col0Id, } _currentBlockData._col0LastId = col0Id; _currentBlockData._col1LastId = col1And2Ids(col1And2Ids.numRows() - 1, 0); + _currentBlockData._col2LastId = col1And2Ids(col1And2Ids.numRows() - 1, 1); AD_CORRECTNESS_CHECK(_buffer.numColumns() == col1And2Ids.numColumns()); auto bufferOldSize = _buffer.numRows(); _buffer.resize(_buffer.numRows() + col1And2Ids.numRows()); @@ -396,7 +397,8 @@ void CompressedRelationWriter::writeRelationToExclusiveBlocks( _blockBuffer.push_back(CompressedBlockMetadata{ std::move(offsets), actualNumRowsPerBlock, col0Id, col0Id, data[i][0], - data[i + actualNumRowsPerBlock - 1][0]}); + data[i + actualNumRowsPerBlock - 1][0], + data[i + actualNumRowsPerBlock - 1][1]}); } } diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 86ba5931af..4b86aabbff 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -60,11 +60,21 @@ struct CompressedBlockMetadata { // 
For example, in the PSO permutation, col0 is the P and col1 is the S. The // col0 ID is not stored in the block. First and last are meant inclusively, // that is, they are both part of the block. + // + // NOTE: Strictly speaking, we don't need `_col0FirstId` and `_col1FirstId`. + // However, they are convenient to have and don't really harm with respect to + // space efficiency. For example, for Wikidata, we have only around 50K blocks + // with block size 8M and around 5M blocks with block size 80K; even the + // latter takes only half a GB in total. Id _col0FirstId; Id _col0LastId; Id _col1FirstId; Id _col1LastId; + // For our `DeltaTriples` (https://github.com/ad-freiburg/qlever/pull/916), we + // need to know the least significant `Id` of the last triple as well. + Id _col2LastId; + // Two of these are equal if all members are equal. bool operator==(const CompressedBlockMetadata&) const = default; }; @@ -83,6 +93,7 @@ AD_SERIALIZE_FUNCTION(CompressedBlockMetadata) { serializer | arg._col0LastId; serializer | arg._col1FirstId; serializer | arg._col1LastId; + serializer | arg._col2LastId; } // The metadata of a whole compressed "relation", where relation refers to a @@ -304,6 +315,7 @@ class CompressedRelationReader { static void decompressColumn(const std::vector& compressedColumn, size_t numRowsToRead, Iterator iterator); + public: // Read the block that is identified by the `blockMetaData` from the `file`, // decompress and return it. // If `columnIndices` is `nullopt`, then all columns of the block are read, diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h index 6d73100e89..dccf095f2a 100644 --- a/src/index/ConstantsIndexBuilding.h +++ b/src/index/ConstantsIndexBuilding.h @@ -79,6 +79,10 @@ constexpr size_t QUEUE_SIZE_BEFORE_PARALLEL_PARSING = 10; // time constexpr size_t QUEUE_SIZE_AFTER_PARALLEL_PARSING = 10; -// The uncompressed size in bytes of a block of the permutations. 
Currently 8MB -// is chosen which is well suited for zstd compression -constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 23u; +// The uncompressed size in bytes of a block of the permutations. +// +// NOTE: This used to be `1 << 23` (over 8M), which is fairly large (we always +// need to decompress at least one whole block, even when reading only few +// triples). With 100K, the total space for all the `CompressedBlockMetadata` is +// still small compared to the rest of the index. +constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 100'000; From f5749f2fb6c93912d015f5c283b4ff2e7f5c02ab Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sun, 19 Mar 2023 21:11:39 +0100 Subject: [PATCH 2/7] Add missing adjustments in tests 1. The test in `IndexMetaDataTest` needs to be adapted to the addition of `_col2LastId`. 2. Unrelated fix in `IndexTestHelpers.h`: The test TTL file was not deleted after the test, now it is. --- test/IndexMetaDataTest.cpp | 10 +++++----- test/IndexTestHelpers.h | 9 +++++---- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/test/IndexMetaDataTest.cpp b/test/IndexMetaDataTest.cpp index 5c5b5af654..f1af1e04e3 100644 --- a/test/IndexMetaDataTest.cpp +++ b/test/IndexMetaDataTest.cpp @@ -17,7 +17,7 @@ auto V = ad_utility::testing::VocabId; TEST(RelationMetaDataTest, writeReadTest) { CompressedBlockMetadata rmdB{ - {{12, 34}, {46, 11}}, 5, V(0), V(2), V(13), V(24)}; + {{12, 34}, {46, 11}}, 5, V(0), V(2), V(13), V(24), V(62)}; CompressedRelationMetadata rmdF{V(1), 3, 2.0, 42.0, 16}; ad_utility::serialization::FileWriteSerializer f("_testtmp.rmd"); @@ -39,9 +39,9 @@ TEST(RelationMetaDataTest, writeReadTest) { TEST(IndexMetaDataTest, writeReadTest2Hmap) { vector bs; bs.push_back(CompressedBlockMetadata{ - {{12, 34}, {42, 5}}, 5, V(0), V(2), V(13), V(24)}); + {{12, 34}, {42, 5}}, 5, V(0), V(2), V(13), V(24), V(62)}); bs.push_back(CompressedBlockMetadata{ - {{16, 34}, {165, 3}}, 5, V(0), V(2), V(13), V(24)}); + {{16, 34}, {165, 3}}, 5, V(0), 
V(2), V(13), V(24), V(62)}); CompressedRelationMetadata rmdF{V(1), 3, 2.0, 42.0, 16}; CompressedRelationMetadata rmdF2{V(2), 5, 3.0, 43.0, 10}; IndexMetaDataHmap imd; @@ -71,9 +71,9 @@ TEST(IndexMetaDataTest, writeReadTest2Mmap) { std::string mmapFilename = imdFilename + ".mmap"; vector bs; bs.push_back(CompressedBlockMetadata{ - {{12, 34}, {42, 17}}, 5, V(0), V(2), V(13), V(24)}); + {{12, 34}, {42, 17}}, 5, V(0), V(2), V(13), V(24), V(62)}); bs.push_back(CompressedBlockMetadata{ - {{12, 34}, {16, 12}}, 5, V(0), V(2), V(13), V(24)}); + {{12, 34}, {16, 12}}, 5, V(0), V(2), V(13), V(24), V(62)}); CompressedRelationMetadata rmdF{V(1), 3, 2.0, 42.0, 16}; CompressedRelationMetadata rmdF2{V(2), 5, 3.0, 43.0, 10}; // The index MetaData does not have an explicit clear, so we diff --git a/test/IndexTestHelpers.h b/test/IndexTestHelpers.h index 72a86d64ee..5ae4638861 100644 --- a/test/IndexTestHelpers.h +++ b/test/IndexTestHelpers.h @@ -34,7 +34,8 @@ inline Index makeIndexWithTestSettings() { // when the files were not deleted after the test). inline std::vector getAllIndexFilenames( const std::string indexBasename) { - return {indexBasename + ".index.pos", + return {indexBasename + ".ttl", + indexBasename + ".index.pos", indexBasename + ".index.pso", indexBasename + ".index.sop", indexBasename + ".index.sop.meta", @@ -64,7 +65,7 @@ inline Index makeTestIndex(const std::string& indexBasename, // these tests. static std::ostringstream ignoreLogStream; ad_utility::setGlobalLoggingStream(&ignoreLogStream); - std::string filename = "relationalExpressionTestIndex.ttl"; + std::string inputFilename = indexBasename + ".ttl"; if (turtleInput.empty()) { turtleInput = "