From 2bbdc2b0fe79b6f502c63202f1de1e84cf6038ec Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Sun, 19 Mar 2023 20:47:11 +0100 Subject: [PATCH] Smaller block size and add _col2LastId to block metadata 1. The block size used to be `1 << 23` (over 8M), which is too large, since we always need to decompress at least one whole block, even when reading only a few triples. It's now 100'000, which still has a relatively small overall space consumption. 2. Add member `_col2LastId` to block data because we need it for the delta triples (https://github.com/ad-freiburg/qlever/pull/916). --- src/index/CompressedRelation.cpp | 4 +++- src/index/CompressedRelation.h | 12 ++++++++++++ src/index/ConstantsIndexBuilding.h | 10 +++++++--- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp index 4dee23172f..a03720af10 100644 --- a/src/index/CompressedRelation.cpp +++ b/src/index/CompressedRelation.cpp @@ -367,6 +367,7 @@ void CompressedRelationWriter::addRelation(Id col0Id, } _currentBlockData._col0LastId = col0Id; _currentBlockData._col1LastId = col1And2Ids(col1And2Ids.numRows() - 1, 0); + _currentBlockData._col2LastId = col1And2Ids(col1And2Ids.numRows() - 1, 1); AD_CORRECTNESS_CHECK(_buffer.numColumns() == col1And2Ids.numColumns()); auto bufferOldSize = _buffer.numRows(); _buffer.resize(_buffer.numRows() + col1And2Ids.numRows()); @@ -396,7 +397,8 @@ void CompressedRelationWriter::writeRelationToExclusiveBlocks( _blockBuffer.push_back(CompressedBlockMetadata{ std::move(offsets), actualNumRowsPerBlock, col0Id, col0Id, data[i][0], - data[i + actualNumRowsPerBlock - 1][0]}); + data[i + actualNumRowsPerBlock - 1][0], + data[i + actualNumRowsPerBlock - 1][1]}); } } diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 86ba5931af..4b86aabbff 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -60,11 +60,21 @@ struct CompressedBlockMetadata { // For 
example, in the PSO permutation, col0 is the P and col1 is the S. The // col0 ID is not stored in the block. First and last are meant inclusively, // that is, they are both part of the block. + // + // NOTE: Strictly speaking, we don't need `_col0FirstId` and `_col1FirstId`. + // However, they are convenient to have and don't really harm with respect to + // space efficiency. For example, for Wikidata, we have only around 50K blocks + // with block size 8M and around 5M blocks with block size 80K; even the + // latter takes only half a GB in total. Id _col0FirstId; Id _col0LastId; Id _col1FirstId; Id _col1LastId; + // For our `DeltaTriples` (https://github.com/ad-freiburg/qlever/pull/916), we + // need to know the least significant `Id` of the last triple as well. + Id _col2LastId; + // Two of these are equal if all members are equal. bool operator==(const CompressedBlockMetadata&) const = default; }; @@ -83,6 +93,7 @@ AD_SERIALIZE_FUNCTION(CompressedBlockMetadata) { serializer | arg._col0LastId; serializer | arg._col1FirstId; serializer | arg._col1LastId; + serializer | arg._col2LastId; } // The metadata of a whole compressed "relation", where relation refers to a @@ -304,6 +315,7 @@ class CompressedRelationReader { static void decompressColumn(const std::vector& compressedColumn, size_t numRowsToRead, Iterator iterator); + public: // Read the block that is identified by the `blockMetaData` from the `file`, // decompress and return it. // If `columnIndices` is `nullopt`, then all columns of the block are read, diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h index 6d73100e89..dccf095f2a 100644 --- a/src/index/ConstantsIndexBuilding.h +++ b/src/index/ConstantsIndexBuilding.h @@ -79,6 +79,10 @@ constexpr size_t QUEUE_SIZE_BEFORE_PARALLEL_PARSING = 10; // time constexpr size_t QUEUE_SIZE_AFTER_PARALLEL_PARSING = 10; -// The uncompressed size in bytes of a block of the permutations. 
Currently 8MB -// is chosen which is well suited for zstd compression -constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 23u; +// The uncompressed size in bytes of a block of the permutations. +// +// NOTE: This used to be `1 << 23` (over 8M), which is fairly large (we always +// need to decompress at least one whole block, even when reading only a few +// triples). With 100K, the total space for all the `CompressedBlockMetadata` is +// still small compared to the rest of the index. +constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 100'000;