From b65375320612625fd1a7325878174a1e2d621eda Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Thu, 3 Aug 2023 12:27:01 +0200 Subject: [PATCH] fix: no longer have regression when no bitmap flipped is most efficient --- endToEndTests/test/info.test.js | 24 +++++++------- src/silo/database.test.cpp | 12 +++---- src/silo/storage/aa_store.cpp | 51 ++++++----------------------- src/silo/storage/sequence_store.cpp | 44 ++++++------------------- src/silo_api/info_handler.cpp | 2 +- 5 files changed, 39 insertions(+), 94 deletions(-) diff --git a/endToEndTests/test/info.test.js b/endToEndTests/test/info.test.js index 302ba175f..aec21d8c5 100644 --- a/endToEndTests/test/info.test.js +++ b/endToEndTests/test/info.test.js @@ -7,7 +7,7 @@ describe('The /info endpoint', () => { .get('/info') .expect(200) .expect('Content-Type', 'application/json') - .expect({ nBitmapsSize: 3898, sequenceCount: 100, totalSize: 60074145 }) + .expect({ nBitmapsSize: 3898, sequenceCount: 100, totalSize: 60055044 }) .end(done); }); @@ -26,15 +26,15 @@ describe('The /info endpoint', () => { 'bitmapContainerSizeStatistic' ); expect(returnedInfo.bitmapContainerSizePerGenomeSection.bitmapContainerSizeStatistic).to.deep.equal({ - numberOfArrayContainers: 47970, - numberOfBitsetContainers: 209, - numberOfRunContainers: 209, - numberOfValuesStoredInArrayContainers: 64283, + numberOfArrayContainers: 43545, + numberOfBitsetContainers: 0, + numberOfRunContainers: 78, + numberOfValuesStoredInArrayContainers: 59694, numberOfValuesStoredInBitsetContainers: 0, - numberOfValuesStoredInRunContainers: 2410, - totalBitmapSizeArrayContainers: 128566, + numberOfValuesStoredInRunContainers: 2237, + totalBitmapSizeArrayContainers: 119388, totalBitmapSizeBitsetContainers: 0, - totalBitmapSizeRunContainers: 3538, + totalBitmapSizeRunContainers: 2964, }); expect(returnedInfo.bitmapContainerSizePerGenomeSection).to.have.property( @@ -62,18 +62,18 @@ describe('The /info endpoint', () => { expect(returnedInfo).to.have.property('bitmapSizePerSymbol'); expect(returnedInfo.bitmapSizePerSymbol).to.deep.equal({ '-': 6003470, - 'A': 6127203, + 'A': 6112681, 'B': 5980600, - 'C': 6073069, + 'C': 6064603, 'D': 5980600, - 'G': 6075909, + 'G': 6067693, 'H': 5980600, 'K': 5980630, 'M': 5980620, 'N': 5980600, 'R': 5980620, 'S': 5980600, - 'T': 6139332, + 'T': 6125253, 'V': 5980600, 'W': 5980600, 'Y': 5980620, diff --git a/src/silo/database.test.cpp b/src/silo/database.test.cpp index e5c669f23..8e692e733 100644 --- a/src/silo/database.test.cpp +++ b/src/silo/database.test.cpp @@ -40,7 +40,7 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { const auto simple_info = database.getDatabaseInfo(); EXPECT_EQ( - detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::NUCLEOTIDE_SYMBOL::A), 6127203 + detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::NUCLEOTIDE_SYMBOL::A), 6112681 ); EXPECT_EQ( detailed_info.bitmap_size_per_symbol.size_in_bytes.at(silo::NUCLEOTIDE_SYMBOL::GAP), 6003470 @@ -54,7 +54,7 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic .number_of_values_stored_in_run_containers, - 2410 + 2237 ); EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic @@ -63,18 +63,18 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { ); EXPECT_EQ( - detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 96205673 + detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_computed, 96160390 ); EXPECT_EQ( - detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 48217381 + detailed_info.bitmap_container_size_per_genome_section.total_bitmap_size_frozen, 48185111 ); EXPECT_EQ( detailed_info.bitmap_container_size_per_genome_section.bitmap_container_size_statistic .total_bitmap_size_array_containers, - 128566 + 119388 ); - EXPECT_EQ(simple_info.total_size, 60074145); + EXPECT_EQ(simple_info.total_size, 60055044); EXPECT_EQ(simple_info.sequence_count, 100); EXPECT_EQ(simple_info.n_bitmaps_size, 3898); } diff --git a/src/silo/storage/aa_store.cpp b/src/silo/storage/aa_store.cpp index ba27183ae..5cf23bd2e 100644 --- a/src/silo/storage/aa_store.cpp +++ b/src/silo/storage/aa_store.cpp @@ -29,6 +29,7 @@ void silo::AAPosition::flipMostNumerousBitmap(uint32_t sequence_count) { for (const auto& symbol : AA_SYMBOLS) { roaring::Roaring bitmap = bitmaps.at(symbol); bitmap.runOptimize(); + bitmap.shrinkToFit(); const uint32_t count = flipped_bitmap_before == symbol ? sequence_count - bitmap.cardinality() : bitmap.cardinality(); if (count > max_count) { @@ -36,13 +37,18 @@ void silo::AAPosition::flipMostNumerousBitmap(uint32_t sequence_count) { max_count = count; } } - if (max_symbol.has_value() && max_symbol != flipped_bitmap_before) { + if (max_symbol != flipped_bitmap_before) { if (flipped_bitmap_before.has_value()) { bitmaps[*flipped_bitmap_before].flip(0, sequence_count); + bitmaps[*flipped_bitmap_before].runOptimize(); + bitmaps[*flipped_bitmap_before].shrinkToFit(); + } + if (max_symbol.has_value()) { + bitmaps[*max_symbol].flip(0, sequence_count); + bitmaps[*max_symbol].runOptimize(); + bitmaps[*max_symbol].shrinkToFit(); } symbol_whose_bitmap_is_flipped = max_symbol; - bitmaps[*max_symbol].flip(0, sequence_count); - bitmaps[*max_symbol].runOptimize(); } } @@ -148,6 +154,7 @@ void silo::AAStorePartition::fillXBitmaps(const std::vector& sequen positions_with_aa_symbol_x.size(), positions_with_aa_symbol_x.data() ); aa_symbol_x_bitmaps[sequence_count + sequence_id].runOptimize(); + aa_symbol_x_bitmaps[sequence_count + sequence_id].shrinkToFit(); positions_with_aa_symbol_x.clear(); } } @@ -160,44 +167,6 @@ void silo::AAStorePartition::interpret(const std::vector& sequences sequence_count += sequences.size(); } -size_t silo::AAStorePartition::computeSize() const { - size_t result = 0; - for (const auto& position : positions) { - for (const AA_SYMBOL symbol : AA_SYMBOLS) { - result += position.bitmaps.at(symbol).getSizeInBytes(false); - } - } - return result; -} - -size_t silo::AAStorePartition::runOptimize() { - std::atomic count_true = 0; - tbb::parallel_for(tbb::blocked_range(0U, positions.size()), [&](const auto& local) { - for (auto position = local.begin(); position != local.end(); ++position) { - for (const AA_SYMBOL symbol : AA_SYMBOLS) { - if (positions[position].bitmaps[symbol].runOptimize()) { - ++count_true; - } - } - } - }); - return count_true; -} - -size_t silo::AAStorePartition::shrinkToFit() { - std::atomic saved = 0; - tbb::parallel_for(tbb::blocked_range(0U, positions.size()), [&](const auto& local) { - size_t local_saved = 0; - for (auto position = local.begin(); position != local.end(); ++position) { - for (const AA_SYMBOL symbol : AA_SYMBOLS) { - local_saved += positions[position].bitmaps[symbol].shrinkToFit(); - } - } - saved += local_saved; - }); - return saved; -} - silo::AAStore::AAStore(std::vector reference_sequence) : reference_sequence(std::move(reference_sequence)) {} diff --git a/src/silo/storage/sequence_store.cpp b/src/silo/storage/sequence_store.cpp index c7e20e849..9c5befd9b 100644 --- a/src/silo/storage/sequence_store.cpp +++ b/src/silo/storage/sequence_store.cpp @@ -29,8 +29,9 @@ void silo::NucPosition::flipMostNumerousBitmap(uint32_t sequence_count) { uint32_t max_count = 0; for (const auto& symbol : NUC_SYMBOLS) { - roaring::Roaring bitmap = bitmaps.at(symbol); + roaring::Roaring& bitmap = bitmaps[symbol]; bitmap.runOptimize(); + bitmap.shrinkToFit(); const uint32_t count = flipped_bitmap_before == symbol ? sequence_count - bitmap.cardinality() : bitmap.cardinality(); if (count > max_count) { @@ -38,13 +39,18 @@ void silo::NucPosition::flipMostNumerousBitmap(uint32_t sequence_count) { max_count = count; } } - if (max_symbol.has_value() && max_symbol != flipped_bitmap_before) { + if (max_symbol != flipped_bitmap_before) { if (flipped_bitmap_before.has_value()) { bitmaps[*flipped_bitmap_before].flip(0, sequence_count); + bitmaps[*flipped_bitmap_before].runOptimize(); + bitmaps[*flipped_bitmap_before].shrinkToFit(); + } + if (max_symbol.has_value()) { + bitmaps[*max_symbol].flip(0, sequence_count); + bitmaps[*max_symbol].runOptimize(); + bitmaps[*max_symbol].shrinkToFit(); } symbol_whose_bitmap_is_flipped = max_symbol; - bitmaps[*max_symbol].flip(0, sequence_count); - bitmaps[*max_symbol].runOptimize(); } } @@ -199,36 +205,6 @@ size_t silo::SequenceStorePartition::computeSize() const { return result; } -size_t silo::SequenceStorePartition::runOptimize() { - std::atomic count_true = 0; - const tbb::blocked_range range(0U, positions.size()); - tbb::parallel_for(range, [&](const decltype(range) local) { - for (auto position = local.begin(); position != local.end(); ++position) { - for (const NUCLEOTIDE_SYMBOL symbol : NUC_SYMBOLS) { - if (positions[position].bitmaps[symbol].runOptimize()) { - ++count_true; - } - } - } - }); - return count_true; -} - -size_t silo::SequenceStorePartition::shrinkToFit() { - std::atomic saved = 0; - const tbb::blocked_range range(0U, positions.size()); - tbb::parallel_for(range, [&](const decltype(range) local) { - size_t local_saved = 0; - for (auto position = local.begin(); position != local.end(); ++position) { - for (const NUCLEOTIDE_SYMBOL symbol : NUC_SYMBOLS) { - local_saved += positions[position].bitmaps[symbol].shrinkToFit(); - } - } - saved += local_saved; - }); - return saved; -} - silo::SequenceStore::SequenceStore(std::vector reference_genome) : reference_genome(std::move(reference_genome)) {} diff --git a/src/silo_api/info_handler.cpp b/src/silo_api/info_handler.cpp index 2e3fb6158..592125d51 100644 --- a/src/silo_api/info_handler.cpp +++ b/src/silo_api/info_handler.cpp @@ -25,7 +25,7 @@ void to_json(nlohmann::json& json, const BitmapContainerSizeStatistic& statistic json = nlohmann::json{ {"numberOfArrayContainers", statistics.number_of_array_containers}, {"numberOfRunContainers", statistics.number_of_run_containers}, - {"numberOfBitsetContainers", statistics.number_of_run_containers}, + {"numberOfBitsetContainers", statistics.number_of_bitset_containers}, {"numberOfValuesStoredInArrayContainers", statistics.number_of_values_stored_in_array_containers}, {"numberOfValuesStoredInRunContainers", statistics.number_of_values_stored_in_run_containers},