ad-freiburg · joka921 · Apr 4, 2023 · Mar 19, 2023 · Mar 19, 2023 · Mar 21, 2023
diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp
@@ -367,6 +367,7 @@ void CompressedRelationWriter::addRelation(Id col0Id,
     }
     _currentBlockData._col0LastId = col0Id;
     _currentBlockData._col1LastId = col1And2Ids(col1And2Ids.numRows() - 1, 0);
+    _currentBlockData._col2LastId = col1And2Ids(col1And2Ids.numRows() - 1, 1);
     AD_CORRECTNESS_CHECK(_buffer.numColumns() == col1And2Ids.numColumns());
     auto bufferOldSize = _buffer.numRows();
     _buffer.resize(_buffer.numRows() + col1And2Ids.numRows());
@@ -396,7 +397,8 @@ void CompressedRelationWriter::writeRelationToExclusiveBlocks(
 
     _blockBuffer.push_back(CompressedBlockMetadata{
         std::move(offsets), actualNumRowsPerBlock, col0Id, col0Id, data[i][0],
-        data[i + actualNumRowsPerBlock - 1][0]});
+        data[i + actualNumRowsPerBlock - 1][0],
+        data[i + actualNumRowsPerBlock - 1][1]});
   }
 }
 

diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h
@@ -60,11 +60,21 @@ struct CompressedBlockMetadata {
   // For example, in the PSO permutation, col0 is the P and col1 is the S. The
   // col0 ID is not stored in the block. First and last are meant inclusively,
   // that is, they are both part of the block.
+  //
+  // NOTE: Strictly speaking, we don't need `_col0FirstId` and `_col1FirstId`.
+  // However, they are convenient to have and cost little space. For example,
+  // for Wikidata, we have only around 50K blocks with block size 8M and around
+  // 5M blocks with block size 80K; even in the latter case, the metadata for
+  // all blocks takes only about half a GB in total.
   Id _col0FirstId;
   Id _col0LastId;
   Id _col1FirstId;
   Id _col1LastId;
 
+  // For our `DeltaTriples` (https://github.com/ad-freiburg/qlever/pull/916), we
+  // also need the col2 `Id` (the least significant one) of the last triple.
+  Id _col2LastId;
+
   // Two of these are equal if all members are equal.
   bool operator==(const CompressedBlockMetadata&) const = default;
 };
@@ -83,6 +93,7 @@ AD_SERIALIZE_FUNCTION(CompressedBlockMetadata) {
   serializer | arg._col0LastId;
   serializer | arg._col1FirstId;
   serializer | arg._col1LastId;
+  serializer | arg._col2LastId;
 }
 
 // The metadata of a whole compressed "relation", where relation refers to a
@@ -304,6 +315,7 @@ class CompressedRelationReader {
   static void decompressColumn(const std::vector<char>& compressedColumn,
                                size_t numRowsToRead, Iterator iterator);
 
+ public:
   // Read the block that is identified by the `blockMetaData` from the `file`,
   // decompress and return it.
   // If `columnIndices` is `nullopt`, then all columns of the block are read,

diff --git a/src/index/ConstantsIndexBuilding.h b/src/index/ConstantsIndexBuilding.h
@@ -79,6 +79,10 @@ constexpr size_t QUEUE_SIZE_BEFORE_PARALLEL_PARSING = 10;
 // time
 constexpr size_t QUEUE_SIZE_AFTER_PARALLEL_PARSING = 10;
 
-//  The uncompressed size in bytes of a block of the permutations. Currently 8MB
-//   is chosen which is well suited for zstd compression
-constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 23u;
+// The uncompressed size in bytes of a block of the permutations.
+//
+// NOTE: This used to be `1 << 23` (over 8M), which is fairly large (we always
+// need to decompress at least one whole block, even when reading only a few
+// triples). With 100K, the total space for all the `CompressedBlockMetadata` is
+// still small compared to the rest of the index.
+constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 100'000;
diff --git a/test/IndexMetaDataTest.cpp b/test/IndexMetaDataTest.cpp
@@ -17,7 +17,7 @@ auto V = ad_utility::testing::VocabId;
 
 TEST(RelationMetaDataTest, writeReadTest) {
   CompressedBlockMetadata rmdB{
-      {{12, 34}, {46, 11}}, 5, V(0), V(2), V(13), V(24)};
+      {{12, 34}, {46, 11}}, 5, V(0), V(2), V(13), V(24), V(62)};
   CompressedRelationMetadata rmdF{V(1), 3, 2.0, 42.0, 16};
 
   ad_utility::serialization::FileWriteSerializer f("_testtmp.rmd");
@@ -39,9 +39,9 @@ TEST(RelationMetaDataTest, writeReadTest) {
 TEST(IndexMetaDataTest, writeReadTest2Hmap) {
   vector<CompressedBlockMetadata> bs;
   bs.push_back(CompressedBlockMetadata{
-      {{12, 34}, {42, 5}}, 5, V(0), V(2), V(13), V(24)});
+      {{12, 34}, {42, 5}}, 5, V(0), V(2), V(13), V(24), V(62)});
   bs.push_back(CompressedBlockMetadata{
-      {{16, 34}, {165, 3}}, 5, V(0), V(2), V(13), V(24)});
+      {{16, 34}, {165, 3}}, 5, V(0), V(2), V(13), V(24), V(62)});
   CompressedRelationMetadata rmdF{V(1), 3, 2.0, 42.0, 16};
   CompressedRelationMetadata rmdF2{V(2), 5, 3.0, 43.0, 10};
   IndexMetaDataHmap imd;
@@ -71,9 +71,9 @@ TEST(IndexMetaDataTest, writeReadTest2Mmap) {
   std::string mmapFilename = imdFilename + ".mmap";
   vector<CompressedBlockMetadata> bs;
   bs.push_back(CompressedBlockMetadata{
-      {{12, 34}, {42, 17}}, 5, V(0), V(2), V(13), V(24)});
+      {{12, 34}, {42, 17}}, 5, V(0), V(2), V(13), V(24), V(62)});
   bs.push_back(CompressedBlockMetadata{
-      {{12, 34}, {16, 12}}, 5, V(0), V(2), V(13), V(24)});
+      {{12, 34}, {16, 12}}, 5, V(0), V(2), V(13), V(24), V(62)});
   CompressedRelationMetadata rmdF{V(1), 3, 2.0, 42.0, 16};
   CompressedRelationMetadata rmdF2{V(2), 5, 3.0, 43.0, 10};
   // The index MetaData does not have an explicit clear, so we

diff --git a/test/IndexTestHelpers.h b/test/IndexTestHelpers.h
@@ -34,7 +34,8 @@ inline Index makeIndexWithTestSettings() {
 // when the files were not deleted after the test).
 inline std::vector<std::string> getAllIndexFilenames(
     const std::string indexBasename) {
-  return {indexBasename + ".index.pos",
+  return {indexBasename + ".ttl",
+          indexBasename + ".index.pos",
           indexBasename + ".index.pso",
           indexBasename + ".index.sop",
           indexBasename + ".index.sop.meta",
@@ -64,7 +65,7 @@ inline Index makeTestIndex(const std::string& indexBasename,
   // these tests.
   static std::ostringstream ignoreLogStream;
   ad_utility::setGlobalLoggingStream(&ignoreLogStream);
-  std::string filename = "relationalExpressionTestIndex.ttl";
+  std::string inputFilename = indexBasename + ".ttl";
   if (turtleInput.empty()) {
     turtleInput =
         "<x> <label> \"alpha\" . <x> <label> \"älpha\" . <x> <label> \"A\" . "
@@ -74,14 +75,14 @@ inline Index makeTestIndex(const std::string& indexBasename,
   }
 
   FILE_BUFFER_SIZE() = 1000;
-  std::fstream f(filename, std::ios_base::out);
+  std::fstream f(inputFilename, std::ios_base::out);
   f << turtleInput;
   f.close();
   {
     Index index = makeIndexWithTestSettings();
     index.setOnDiskBase(indexBasename);
     index.setUsePatterns(true);
-    index.createFromFile<TurtleParserAuto>(filename);
+    index.createFromFile<TurtleParserAuto>(inputFilename);
   }
   Index index;
   index.setUsePatterns(true);