From e0670629b1a40dc0e56a31e15eaa42e51a350d66 Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Mon, 21 Aug 2023 10:35:15 +0200 Subject: [PATCH] feat: insertions action --- .../insertionsInvalidColumn.json | 17 ++ .../insertionsInvalidSequence.json | 17 ++ .../queries/insertionsActionAndFilter.json | 23 ++ .../test/queries/insertionsColumn.json | 38 +++ include/silo/query_engine/actions/action.h | 2 +- .../silo/query_engine/actions/insertions.h | 60 ++++ .../silo/storage/column/insertion_column.h | 5 +- include/silo/storage/column/insertion_index.h | 2 + src/silo/database.cpp | 4 +- .../query_engine/actions/aa_mutations.cpp | 2 - src/silo/query_engine/actions/action.cpp | 15 +- src/silo/query_engine/actions/insertions.cpp | 285 ++++++++++++++++++ .../query_engine/actions/nuc_mutations.cpp | 1 - src/silo/storage/column/insertion_column.cpp | 8 +- .../storage/column/insertion_column.test.cpp | 4 +- src/silo/storage/column/insertion_index.cpp | 6 + 16 files changed, 478 insertions(+), 11 deletions(-) create mode 100644 endToEndTests/test/invalidQueries/insertionsInvalidColumn.json create mode 100644 endToEndTests/test/invalidQueries/insertionsInvalidSequence.json create mode 100644 endToEndTests/test/queries/insertionsActionAndFilter.json create mode 100644 endToEndTests/test/queries/insertionsColumn.json diff --git a/endToEndTests/test/invalidQueries/insertionsInvalidColumn.json b/endToEndTests/test/invalidQueries/insertionsInvalidColumn.json new file mode 100644 index 000000000..fb7b83320 --- /dev/null +++ b/endToEndTests/test/invalidQueries/insertionsInvalidColumn.json @@ -0,0 +1,17 @@ +{ + "testCaseName": "The insertions action with an invalid column", + "query": { + "action": { + "type": "Insertions", + "column": "insertionsThatAreNotThere", + "sequenceName": "anything" + }, + "filterExpression": { + "type": "True" + } + }, + "expectedError": { + "error": "Bad request", + "message": "The column 'insertionsThatAreNotThere' does not exist." + } +} diff --git a/endToEndTests/test/invalidQueries/insertionsInvalidSequence.json b/endToEndTests/test/invalidQueries/insertionsInvalidSequence.json new file mode 100644 index 000000000..e9c8b16b3 --- /dev/null +++ b/endToEndTests/test/invalidQueries/insertionsInvalidSequence.json @@ -0,0 +1,17 @@ +{ + "testCaseName": "The insertions action with an invalid sequence", + "query": { + "action": { + "type": "Insertions", + "column": "insertions", + "sequenceName": "S" + }, + "filterExpression": { + "type": "True" + } + }, + "expectedError": { + "error": "Bad request", + "message": "The column 'insertions' does not contain the sequence 'S'" + } +} diff --git a/endToEndTests/test/queries/insertionsActionAndFilter.json b/endToEndTests/test/queries/insertionsActionAndFilter.json new file mode 100644 index 000000000..5eccca02b --- /dev/null +++ b/endToEndTests/test/queries/insertionsActionAndFilter.json @@ -0,0 +1,23 @@ +{ + "testCaseName": "The insertions action and insertions contains filter", + "query": { + "action": { + "type": "Insertions", + "column": "insertions" + }, + "filterExpression": { + "type": "InsertionContains", + "column": "insertions", + "position": 22339, + "value": ".*C.*G.*" + } + }, + "expectedQueryResult": [ + { + "count": 1, + "insertions": "GCTGGT", + "position": "22340", + "sequenceName": "" + } + ] +} diff --git a/endToEndTests/test/queries/insertionsColumn.json b/endToEndTests/test/queries/insertionsColumn.json new file mode 100644 index 000000000..6268fa29a --- /dev/null +++ b/endToEndTests/test/queries/insertionsColumn.json @@ -0,0 +1,38 @@ +{ + "testCaseName": "The insertions action", + "query": { + "action": { + "type": "Insertions", + "column": "insertions" + }, + "filterExpression": { + "type": "True" + } + }, + "expectedQueryResult": [ + { + "count": 1, + "insertions": "TAT", + "position": "5960", + "sequenceName": "" + }, + { + "count": 1, + "insertions": "CAGAA", + "position": "22205", + "sequenceName": "" + }, + { + "count": 1, + "insertions": "GCTGGT", + "position": "22340", + "sequenceName": "" + }, + { + "count": 17, + "insertions": "CCC", + "position": "25702", + "sequenceName": "" + } + ] +} diff --git a/include/silo/query_engine/actions/action.h b/include/silo/query_engine/actions/action.h index 427988f1d..a1d0aeead 100644 --- a/include/silo/query_engine/actions/action.h +++ b/include/silo/query_engine/actions/action.h @@ -36,7 +36,7 @@ class Action { void applySort(QueryResult& result) const; void applyOffsetAndLimit(QueryResult& result) const; - [[nodiscard]] virtual void validateOrderByFields(const Database& database) const = 0; + virtual void validateOrderByFields(const Database& database) const = 0; [[nodiscard]] virtual QueryResult execute( const Database& database, diff --git a/include/silo/query_engine/actions/insertions.h b/include/silo/query_engine/actions/insertions.h index 2c583397e..e7e8f692a 100644 --- a/include/silo/query_engine/actions/insertions.h +++ b/include/silo/query_engine/actions/insertions.h @@ -1,4 +1,64 @@ #ifndef SILO_INSERTIONS_H #define SILO_INSERTIONS_H +#include "silo/query_engine/actions/action.h" +#include "silo/storage/column/insertion_index.h" + +namespace silo::query_engine { + +struct QueryResultEntry; + +namespace actions { + +template +class InsertionAggregation : public Action { + static constexpr std::string_view POSITION_FIELD_NAME = "position"; + static constexpr std::string_view INSERTION_FIELD_NAME = "insertions"; + static constexpr std::string_view SEQUENCE_FIELD_NAME = "sequenceName"; + static constexpr std::string_view COUNT_FIELD_NAME = "count"; + + std::string column_name; + std::vector sequence_names; + + struct PrefilteredBitmaps { + std::vector&>> + bitmaps; + std::vector&>> + full_bitmaps; + }; + + void addAggregatedInsertionsToInsertionCounts( + std::vector& output, + const std::string& sequence_name, + const PrefilteredBitmaps& prefiltered_bitmaps + ) const; + + std::unordered_map::PrefilteredBitmaps> + validateFieldsAndPreFilterBitmaps( + const Database& database, + std::vector& bitmap_filter + ) const; + + public: + InsertionAggregation(std::string column, std::vector&& sequence_names); + + void validateOrderByFields(const Database& database) const override; + + [[nodiscard]] QueryResult execute( + const Database& database, + std::vector bitmap_filter + ) const override; +}; + +template +// NOLINTNEXTLINE(readability-identifier-naming) +void from_json(const nlohmann::json& json, std::unique_ptr>& action); + +} // namespace actions +} // namespace silo::query_engine + #endif // SILO_INSERTIONS_H diff --git a/include/silo/storage/column/insertion_column.h b/include/silo/storage/column/insertion_column.h index 703c17fa5..305df5d7c 100644 --- a/include/silo/storage/column/insertion_column.h +++ b/include/silo/storage/column/insertion_column.h @@ -43,7 +43,10 @@ class InsertionColumnPartition { void insert(const std::string& value); - void buildInsertionIndex(); + void buildInsertionIndexes(); + + const std::unordered_map>& getInsertionIndexes( + ) const; [[nodiscard]] std::unique_ptr search( const std::string& sequence_name, diff --git a/include/silo/storage/column/insertion_index.h b/include/silo/storage/column/insertion_index.h index d2413a2ff..66ca0133e 100644 --- a/include/silo/storage/column/insertion_index.h +++ b/include/silo/storage/column/insertion_index.h @@ -96,6 +96,8 @@ class InsertionIndex { void buildIndex(); + const std::unordered_map>& getInsertionPositions() const; + std::unique_ptr search(uint32_t position, const std::string& search_pattern) const; }; diff --git a/src/silo/database.cpp b/src/silo/database.cpp index 3aeea680e..210a3a1bc 100644 --- a/src/silo/database.cpp +++ b/src/silo/database.cpp @@ -762,10 +762,10 @@ void Database::initializeAASequences( void Database::finalizeInsertionIndexes() { tbb::parallel_for_each(partitions.begin(), partitions.end(), [](auto& partition) { for (auto& insertion_column : partition.columns.nuc_insertion_columns) { - insertion_column.second.buildInsertionIndex(); + insertion_column.second.buildInsertionIndexes(); } for (auto& insertion_column : partition.columns.aa_insertion_columns) { - insertion_column.second.buildInsertionIndex(); + insertion_column.second.buildInsertionIndexes(); } }); } diff --git a/src/silo/query_engine/actions/aa_mutations.cpp b/src/silo/query_engine/actions/aa_mutations.cpp index 6e00f9597..8fa44c812 100644 --- a/src/silo/query_engine/actions/aa_mutations.cpp +++ b/src/silo/query_engine/actions/aa_mutations.cpp @@ -178,8 +178,6 @@ QueryResult AAMutations::execute( const Database& database, std::vector bitmap_filter ) const { - using roaring::Roaring; - std::vector aa_sequence_names_to_evaluate; for (const auto& aa_sequence_name : aa_sequence_names) { CHECK_SILO_QUERY( diff --git a/src/silo/query_engine/actions/action.cpp b/src/silo/query_engine/actions/action.cpp index fbfa990bf..ad24ef7a1 100644 --- a/src/silo/query_engine/actions/action.cpp +++ b/src/silo/query_engine/actions/action.cpp @@ -14,6 +14,7 @@ #include "silo/query_engine/actions/details.h" #include "silo/query_engine/actions/fasta.h" #include "silo/query_engine/actions/fasta_aligned.h" +#include "silo/query_engine/actions/insertions.h" #include "silo/query_engine/actions/nuc_mutations.h" #include "silo/query_engine/operator_result.h" #include "silo/query_engine/query_parse_exception.h" @@ -125,7 +126,15 @@ void from_json(const nlohmann::json& json, OrderByField& field) { "' must be either a string or an object containing the fields 'field':string and " "'order':string, where the value of order is 'ascending' or 'descending'" ) - field = {json["field"].get(), json["order"].get() == "ascending"}; + const std::string field_name = json["field"].get(); + const std::string order_string = json["order"].get(); + CHECK_SILO_QUERY( + order_string == "ascending" || order_string == "descending", + "The orderByField '" + json.dump() + + "' must be either a string or an object containing the fields 'field':string and " + "'order':string, where the value of order is 'ascending' or 'descending'" + ) + field = {field_name, json["order"].get() == "ascending"}; } // NOLINTNEXTLINE(readability-identifier-naming) @@ -148,6 +157,10 @@ void from_json(const nlohmann::json& json, std::unique_ptr& action) { action = json.get>(); } else if (expression_type == "FastaAligned") { action = json.get>(); + } else if (expression_type == "Insertions") { + action = json.get>>(); + } else if (expression_type == "AminoAcidInsertions") { + action = json.get>>(); } else { throw QueryParseException(expression_type + " is not a valid action"); } diff --git a/src/silo/query_engine/actions/insertions.cpp b/src/silo/query_engine/actions/insertions.cpp index e69de29bb..36ae83dda 100644 --- a/src/silo/query_engine/actions/insertions.cpp +++ b/src/silo/query_engine/actions/insertions.cpp @@ -0,0 +1,285 @@ +#include "silo/query_engine/actions/insertions.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "silo/config/database_config.h" +#include "silo/database.h" +#include "silo/query_engine/operator_result.h" +#include "silo/query_engine/query_parse_exception.h" +#include "silo/query_engine/query_result.h" +#include "silo/storage/column/insertion_column.h" +#include "silo/storage/column/insertion_index.h" + +using silo::query_engine::OperatorResult; + +namespace silo::query_engine::actions { + +template +InsertionAggregation::InsertionAggregation( + std::string column, + std::vector&& sequence_names +) + : column_name(std::move(column)), + sequence_names(std::move(sequence_names)) {} + +template +void InsertionAggregation::validateOrderByFields(const Database& /*database*/) const { + const std::vector result_field_names{ + {std::string{POSITION_FIELD_NAME}, + std::string{INSERTION_FIELD_NAME}, + std::string{SEQUENCE_FIELD_NAME}, + std::string{COUNT_FIELD_NAME}}}; + + for (const OrderByField& field : order_by_fields) { + CHECK_SILO_QUERY( + std::any_of( + result_field_names.begin(), + result_field_names.end(), + [&](const std::string& result_field) { return result_field == field.name; } + ), + "OrderByField " + field.name + " is not contained in the result of this operation." + ) + } +} + +template <> +std::unordered_map::PrefilteredBitmaps> +InsertionAggregation::validateFieldsAndPreFilterBitmaps( + const Database& database, + std::vector& bitmap_filter +) const { + CHECK_SILO_QUERY( + database.columns.aa_insertion_columns.contains(column_name), + "The column " + column_name + " does not exist." + ) + std::unordered_map bitmaps_to_evaluate; + for (size_t i = 0; i < database.partitions.size(); ++i) { + const DatabasePartition& database_partition = database.partitions.at(i); + const auto& insertion_indexes = + database_partition.columns.aa_insertion_columns.at(column_name).getInsertionIndexes(); + OperatorResult& filter = bitmap_filter[i]; + + for (const auto& sequence_name : sequence_names) { + CHECK_SILO_QUERY( + insertion_indexes.contains(sequence_name), + "The column '" + column_name + "' does not contain the sequence '" + sequence_name + "'" + ) + } + + const size_t cardinality = filter->cardinality(); + if (cardinality == 0) { + continue; + } + if (cardinality == database_partition.sequence_count) { + for (const auto& [sequence_name, sequence_index] : insertion_indexes) { + if(sequence_names.empty() || + std::find(sequence_names.begin(), sequence_names.end(), sequence_name) != sequence_names.end()){ + bitmaps_to_evaluate[sequence_name].full_bitmaps.emplace_back(filter, sequence_index); + } + } + } else { + if (filter.isMutable()) { + filter->runOptimize(); + } + for (const auto& [sequence_name, sequence_index] : insertion_indexes) { + if(sequence_names.empty() || + std::find(sequence_names.begin(), sequence_names.end(), sequence_name) != sequence_names.end()){ + bitmaps_to_evaluate[sequence_name].bitmaps.emplace_back(filter, sequence_index); + } + } + } + } + return bitmaps_to_evaluate; +} + +template <> +std::unordered_map::PrefilteredBitmaps> +InsertionAggregation::validateFieldsAndPreFilterBitmaps( + const Database& database, + std::vector& bitmap_filter +) const { + CHECK_SILO_QUERY( + database.columns.nuc_insertion_columns.contains(column_name), + "The column '" + column_name + "' does not exist." + ) + + std::unordered_map bitmaps_to_evaluate; + for (size_t i = 0; i < database.partitions.size(); ++i) { + const DatabasePartition& database_partition = database.partitions.at(i); + const auto& insertion_indexes = + database_partition.columns.nuc_insertion_columns.at(column_name).getInsertionIndexes(); + OperatorResult& filter = bitmap_filter[i]; + + for (const auto& sequence_name : sequence_names) { + CHECK_SILO_QUERY( + insertion_indexes.contains(sequence_name), + "The column '" + column_name + "' does not contain the sequence '" + sequence_name + "'" + ) + } + + const size_t cardinality = filter->cardinality(); + if (cardinality == 0) { + continue; + } + if (cardinality == database_partition.sequence_count) { + for (const auto& [sequence_name, sequence_index] : insertion_indexes) { + if(sequence_names.empty() || + std::find(sequence_names.begin(), sequence_names.end(), sequence_name) != sequence_names.end()){ + bitmaps_to_evaluate[sequence_name].full_bitmaps.emplace_back(filter, sequence_index); + } + } + } else { + if (filter.isMutable()) { + filter->runOptimize(); + } + for (const auto& [sequence_name, sequence_index] : insertion_indexes) { + if(sequence_names.empty() || + std::find(sequence_names.begin(), sequence_names.end(), sequence_name) != sequence_names.end()){ + bitmaps_to_evaluate[sequence_name].bitmaps.emplace_back(filter, sequence_index); + } + } + } + } + return bitmaps_to_evaluate; +} + +struct PositionAndInsertion { + uint32_t position; + std::string_view insertion_value; + + bool operator==(const PositionAndInsertion& other) const { + return position == other.position && insertion_value == other.insertion_value; + } +}; +} // namespace silo::query_engine::actions + +using silo::query_engine::actions::PositionAndInsertion; + +template <> +struct std::hash { + std::size_t operator()(const PositionAndInsertion& position_and_insertion) const noexcept { + std::size_t seed = 0; + boost::hash_combine(seed, position_and_insertion.position); + boost::hash_combine(seed, position_and_insertion.insertion_value); + return seed; + } +}; + +namespace silo::query_engine::actions { + +template +void InsertionAggregation::addAggregatedInsertionsToInsertionCounts( + std::vector& output, + const std::string& sequence_name, + const PrefilteredBitmaps& prefiltered_bitmaps +) const { + std::unordered_map all_insertions; + for (const auto& [_, insertion_index] : prefiltered_bitmaps.full_bitmaps) { + for (const auto& [position, insertions_at_position] : + insertion_index.getInsertionPositions()) { + for (const auto& insertion : insertions_at_position.insertions) { + all_insertions[PositionAndInsertion{position, insertion.value}] += + insertion.sequence_ids.cardinality(); + } + } + } + for (const auto& [bitmap_filter, insertion_index] : prefiltered_bitmaps.bitmaps) { + for (const auto& [position, insertions_at_position] : + insertion_index.getInsertionPositions()) { + for (const auto& insertion : insertions_at_position.insertions) { + const uint32_t count = insertion.sequence_ids.and_cardinality(*bitmap_filter); + if (count > 0) { + all_insertions[PositionAndInsertion{position, insertion.value}] += count; + } + } + } + } + for (const auto& [position_and_insertion, count] : all_insertions) { + const std::map>> fields{ + {std::string(POSITION_FIELD_NAME), std::to_string(position_and_insertion.position + 1)}, + {std::string(SEQUENCE_FIELD_NAME), sequence_name}, + {std::string(INSERTION_FIELD_NAME), std::string(position_and_insertion.insertion_value)}, + {std::string(COUNT_FIELD_NAME), static_cast(count)}}; + output.push_back({fields}); + } +} + +template +QueryResult InsertionAggregation::execute( + const Database& database, + std::vector bitmap_filter +) const { + using storage::column::insertion::InsertionIndex; + + std::unordered_map::PrefilteredBitmaps> + bitmaps_to_evaluate = validateFieldsAndPreFilterBitmaps(database, bitmap_filter); + + std::vector insertion_counts; + for (const auto& [sequence_name, prefiltered_bitmaps] : bitmaps_to_evaluate) { + addAggregatedInsertionsToInsertionCounts( + insertion_counts, sequence_name, prefiltered_bitmaps + ); + } + return {insertion_counts}; +} + +template +// NOLINTNEXTLINE(readability-identifier-naming) +void from_json(const nlohmann::json& json, std::unique_ptr>& action) { + CHECK_SILO_QUERY( + !json.contains("sequenceName") || + (json["sequenceName"].is_string() || json["sequenceName"].is_array()), + "Insertions action can have the field sequenceName of type string or an array of " + "strings, but no other type" + ) + std::vector sequence_names; + if (json.contains("sequenceName") && json["sequenceName"].is_array()) { + for (const auto& child : json["sequenceName"]) { + CHECK_SILO_QUERY( + child.is_string(), + "The field sequenceName of the Insertions action must have type string or an " + "array, if present. Found:" + + child.dump() + ) + sequence_names.emplace_back(child.get()); + } + } else if (json.contains("sequenceName") && json["sequenceName"].is_string()) { + sequence_names.emplace_back(json["sequenceName"].get()); + } + + CHECK_SILO_QUERY( + json.contains("column") && json["column"].is_string(), + "Insertions must have the field 'column' of type string" + ) + const std::string column = json["column"].get(); + + action = std::make_unique>(column, std::move(sequence_names)); +} + +template void from_json( + const nlohmann::json& json, + std::unique_ptr>& action +); + +template void from_json( + const nlohmann::json& json, + std::unique_ptr>& action +); + +template class InsertionAggregation; +template class InsertionAggregation; + +} // namespace silo::query_engine::actions diff --git a/src/silo/query_engine/actions/nuc_mutations.cpp b/src/silo/query_engine/actions/nuc_mutations.cpp index db5180119..0f4749414 100644 --- a/src/silo/query_engine/actions/nuc_mutations.cpp +++ b/src/silo/query_engine/actions/nuc_mutations.cpp @@ -132,7 +132,6 @@ QueryResult NucMutations::execute( const Database& database, std::vector bitmap_filter ) const { - using roaring::Roaring; const std::string nuc_sequence_name_or_default = nuc_sequence_name.value_or(database.database_config.default_nucleotide_sequence); CHECK_SILO_QUERY( diff --git a/src/silo/storage/column/insertion_column.cpp b/src/silo/storage/column/insertion_column.cpp index 2fbb9207a..939fd9867 100644 --- a/src/silo/storage/column/insertion_column.cpp +++ b/src/silo/storage/column/insertion_column.cpp @@ -65,12 +65,18 @@ void InsertionColumnPartition::insert(const std::string& value) { } template -void InsertionColumnPartition::buildInsertionIndex() { +void InsertionColumnPartition::buildInsertionIndexes() { for (auto& [_, insertion_index] : insertion_indexes) { insertion_index.buildIndex(); } } +template +const std::unordered_map>& InsertionColumnPartition< + Symbol>::getInsertionIndexes() const { + return insertion_indexes; +} + template std::unique_ptr InsertionColumnPartition::search( const std::string& sequence_name, diff --git a/src/silo/storage/column/insertion_column.test.cpp b/src/silo/storage/column/insertion_column.test.cpp index 008bfa64b..0d08b7f80 100644 --- a/src/silo/storage/column/insertion_column.test.cpp +++ b/src/silo/storage/column/insertion_column.test.cpp @@ -14,7 +14,7 @@ TEST(InsertionColumn, insertValuesToPartition) { under_test.insert("19832:TTACA"); under_test.insert("25701:ACCA"); - under_test.buildInsertionIndex(); + under_test.buildInsertionIndexes(); EXPECT_EQ(under_test.getValues()[0], 0U); EXPECT_EQ(under_test.getValues()[1], 1U); @@ -38,7 +38,7 @@ TEST(InsertionColumn, shouldReturnTheCorrectSearchedValues) { under_test.insert("25701:ACCA"); under_test.insert("25701:TTACAT,25701:ACCA,25701:AGCTGTTCAG"); - under_test.buildInsertionIndex(); + under_test.buildInsertionIndexes(); const auto result1 = under_test.search("", 25701, ".*CC.*"); ASSERT_EQ(*result1, roaring::Roaring({0, 1, 2, 3, 4, 5})); diff --git a/src/silo/storage/column/insertion_index.cpp b/src/silo/storage/column/insertion_index.cpp index f77edbf41..725a3edc1 100644 --- a/src/silo/storage/column/insertion_index.cpp +++ b/src/silo/storage/column/insertion_index.cpp @@ -273,6 +273,12 @@ void InsertionIndex::buildIndex() { collected_insertions.clear(); } +template +const std::unordered_map>& InsertionIndex< + Symbol>::getInsertionPositions() const { + return insertion_positions; +} + template std::unique_ptr InsertionIndex::search( uint32_t position,