Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Smaller block size and add _col2LastId to block metadata #917

Merged
merged 9 commits into from
Apr 4, 2023
4 changes: 3 additions & 1 deletion src/index/CompressedRelation.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,7 @@ void CompressedRelationWriter::addRelation(Id col0Id,
}
_currentBlockData._col0LastId = col0Id;
_currentBlockData._col1LastId = col1And2Ids(col1And2Ids.numRows() - 1, 0);
_currentBlockData._col2LastId = col1And2Ids(col1And2Ids.numRows() - 1, 1);
AD_CORRECTNESS_CHECK(_buffer.numColumns() == col1And2Ids.numColumns());
auto bufferOldSize = _buffer.numRows();
_buffer.resize(_buffer.numRows() + col1And2Ids.numRows());
Expand Down Expand Up @@ -396,7 +397,8 @@ void CompressedRelationWriter::writeRelationToExclusiveBlocks(

_blockBuffer.push_back(CompressedBlockMetadata{
std::move(offsets), actualNumRowsPerBlock, col0Id, col0Id, data[i][0],
data[i + actualNumRowsPerBlock - 1][0]});
data[i + actualNumRowsPerBlock - 1][0],
data[i + actualNumRowsPerBlock - 1][1]});
}
}

Expand Down
12 changes: 12 additions & 0 deletions src/index/CompressedRelation.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,11 +60,21 @@ struct CompressedBlockMetadata {
// For example, in the PSO permutation, col0 is the P and col1 is the S. The
// col0 ID is not stored in the block. First and last are meant inclusively,
// that is, they are both part of the block.
//
// NOTE: Strictly speaking, we don't need `_col0FirstId` and `_col1FirstId`.
// However, they are convenient to have and do not noticeably hurt space
// efficiency. For example, for Wikidata, we have only around 50K blocks
// with block size 8M and around 5M blocks with block size 80K; even the
// latter takes only half a GB in total.
Id _col0FirstId;
Id _col0LastId;
Id _col1FirstId;
Id _col1LastId;

// For our `DeltaTriples` (https://github.com/ad-freiburg/qlever/pull/916), we
// need to know the least significant `Id` of the last triple as well.
Id _col2LastId;

// Two of these are equal if all members are equal.
bool operator==(const CompressedBlockMetadata&) const = default;
};
Expand All @@ -83,6 +93,7 @@ AD_SERIALIZE_FUNCTION(CompressedBlockMetadata) {
serializer | arg._col0LastId;
serializer | arg._col1FirstId;
serializer | arg._col1LastId;
serializer | arg._col2LastId;
}

// The metadata of a whole compressed "relation", where relation refers to a
Expand Down Expand Up @@ -304,6 +315,7 @@ class CompressedRelationReader {
static void decompressColumn(const std::vector<char>& compressedColumn,
size_t numRowsToRead, Iterator iterator);

public:
// Read the block that is identified by the `blockMetaData` from the `file`,
// decompress and return it.
// If `columnIndices` is `nullopt`, then all columns of the block are read,
Expand Down
10 changes: 7 additions & 3 deletions src/index/ConstantsIndexBuilding.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,10 @@ constexpr size_t QUEUE_SIZE_BEFORE_PARALLEL_PARSING = 10;
// time
constexpr size_t QUEUE_SIZE_AFTER_PARALLEL_PARSING = 10;

// The uncompressed size in bytes of a block of the permutations. Currently 8MB
// is chosen which is well suited for zstd compression
constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 1ul << 23u;
// The uncompressed size in bytes of a block of the permutations.
//
// NOTE: This used to be `1 << 23` (over 8M), which is fairly large (we always
// need to decompress at least one whole block, even when reading only a few
// triples). With 100K, the total space for all the `CompressedBlockMetadata` is
// still small compared to the rest of the index.
constexpr size_t BLOCKSIZE_COMPRESSED_METADATA = 100'000;
10 changes: 5 additions & 5 deletions test/IndexMetaDataTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ auto V = ad_utility::testing::VocabId;

TEST(RelationMetaDataTest, writeReadTest) {
CompressedBlockMetadata rmdB{
{{12, 34}, {46, 11}}, 5, V(0), V(2), V(13), V(24)};
{{12, 34}, {46, 11}}, 5, V(0), V(2), V(13), V(24), V(62)};
CompressedRelationMetadata rmdF{V(1), 3, 2.0, 42.0, 16};

ad_utility::serialization::FileWriteSerializer f("_testtmp.rmd");
Expand All @@ -39,9 +39,9 @@ TEST(RelationMetaDataTest, writeReadTest) {
TEST(IndexMetaDataTest, writeReadTest2Hmap) {
vector<CompressedBlockMetadata> bs;
bs.push_back(CompressedBlockMetadata{
{{12, 34}, {42, 5}}, 5, V(0), V(2), V(13), V(24)});
{{12, 34}, {42, 5}}, 5, V(0), V(2), V(13), V(24), V(62)});
bs.push_back(CompressedBlockMetadata{
{{16, 34}, {165, 3}}, 5, V(0), V(2), V(13), V(24)});
{{16, 34}, {165, 3}}, 5, V(0), V(2), V(13), V(24), V(62)});
CompressedRelationMetadata rmdF{V(1), 3, 2.0, 42.0, 16};
CompressedRelationMetadata rmdF2{V(2), 5, 3.0, 43.0, 10};
IndexMetaDataHmap imd;
Expand Down Expand Up @@ -71,9 +71,9 @@ TEST(IndexMetaDataTest, writeReadTest2Mmap) {
std::string mmapFilename = imdFilename + ".mmap";
vector<CompressedBlockMetadata> bs;
bs.push_back(CompressedBlockMetadata{
{{12, 34}, {42, 17}}, 5, V(0), V(2), V(13), V(24)});
{{12, 34}, {42, 17}}, 5, V(0), V(2), V(13), V(24), V(62)});
bs.push_back(CompressedBlockMetadata{
{{12, 34}, {16, 12}}, 5, V(0), V(2), V(13), V(24)});
{{12, 34}, {16, 12}}, 5, V(0), V(2), V(13), V(24), V(62)});
CompressedRelationMetadata rmdF{V(1), 3, 2.0, 42.0, 16};
CompressedRelationMetadata rmdF2{V(2), 5, 3.0, 43.0, 10};
// The index MetaData does not have an explicit clear, so we
Expand Down
9 changes: 5 additions & 4 deletions test/IndexTestHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ inline Index makeIndexWithTestSettings() {
// when the files were not deleted after the test).
inline std::vector<std::string> getAllIndexFilenames(
const std::string indexBasename) {
return {indexBasename + ".index.pos",
return {indexBasename + ".ttl",
indexBasename + ".index.pos",
indexBasename + ".index.pso",
indexBasename + ".index.sop",
indexBasename + ".index.sop.meta",
Expand Down Expand Up @@ -64,7 +65,7 @@ inline Index makeTestIndex(const std::string& indexBasename,
// these tests.
static std::ostringstream ignoreLogStream;
ad_utility::setGlobalLoggingStream(&ignoreLogStream);
std::string filename = "relationalExpressionTestIndex.ttl";
std::string inputFilename = indexBasename + ".ttl";
if (turtleInput.empty()) {
turtleInput =
"<x> <label> \"alpha\" . <x> <label> \"älpha\" . <x> <label> \"A\" . "
Expand All @@ -74,14 +75,14 @@ inline Index makeTestIndex(const std::string& indexBasename,
}

FILE_BUFFER_SIZE() = 1000;
std::fstream f(filename, std::ios_base::out);
std::fstream f(inputFilename, std::ios_base::out);
f << turtleInput;
f.close();
{
Index index = makeIndexWithTestSettings();
index.setOnDiskBase(indexBasename);
index.setUsePatterns(true);
index.createFromFile<TurtleParserAuto>(filename);
index.createFromFile<TurtleParserAuto>(inputFilename);
}
Index index;
index.setUsePatterns(true);
Expand Down