Skip to content

Commit

Permalink
feat: insertion action targets all insertion columns by default
Browse files Browse the repository at this point in the history
  • Loading branch information
Taepper committed Aug 30, 2023
1 parent 6b61985 commit 6b70241
Show file tree
Hide file tree
Showing 9 changed files with 140 additions and 87 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@
},
"expectedError": {
"error": "Bad request",
"message": "The column 'insertions' does not contain the sequence 'S'"
"message": "The database does not contain the Nucleotide sequence 'S'"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@
},
"expectedError": {
"error": "Bad request",
"message": "The column 'insertionsThatAreNotThere' does not exist."
"message": "The database does not contain the Nucleotide column 'insertionsThatAreNotThere'"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@
},
"expectedError": {
"error": "Bad request",
"message": "The column 'insertions' does not contain the sequence 'S'"
"message": "The database does not contain the Nucleotide sequence 'notAValidSequence'"
}
}
3 changes: 3 additions & 0 deletions include/silo/database.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ class Database {
template <typename SymbolType>
std::optional<std::string> getDefaultSequenceName() const;

/// Returns the names of all sequences of the given symbol type stored in this database.
/// Explicitly specialized for Nucleotide and AminoAcid; each specialization collects the
/// keys of the corresponding sequence container into a freshly built vector.
template <typename SymbolType>
std::vector<std::string> getSequenceNames() const;

virtual query_engine::QueryResult executeQuery(const std::string& query) const;

private:
Expand Down
15 changes: 13 additions & 2 deletions include/silo/query_engine/actions/insertions.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define SILO_INSERTIONS_H

#include "silo/query_engine/actions/action.h"
#include "silo/storage/column/insertion_column.h"
#include "silo/storage/column/insertion_index.h"

namespace silo::query_engine {
Expand All @@ -17,7 +18,7 @@ class InsertionAggregation : public Action {
static constexpr std::string_view SEQUENCE_FIELD_NAME = "sequenceName";
static constexpr std::string_view COUNT_FIELD_NAME = "count";

std::string column_name;
std::vector<std::string> column_names;
std::vector<std::string> sequence_names;

struct PrefilteredBitmaps {
Expand All @@ -31,6 +32,13 @@ class InsertionAggregation : public Action {
full_bitmaps;
};

/// Registers the insertion indexes of one insertion column partition for later evaluation.
///
/// For every (sequence name, insertion index) pair of `column` whose sequence name is
/// selected by this action (all names when `sequence_names` is empty), a
/// (filter, insertion index) entry is appended to `bitmaps_to_evaluate[sequence name]`.
///
/// @param column               insertion column partition whose indexes are added
/// @param filter               filter result that is paired with each added index
/// @param bitmaps_to_evaluate  output map keyed by sequence name; entries are created on demand
void addAllColumnIndexesToPreFilteredBitmaps(
const silo::storage::column::InsertionColumnPartition<SymbolType>& column,
const OperatorResult& filter,
std::unordered_map<std::string, InsertionAggregation<SymbolType>::PrefilteredBitmaps>&
bitmaps_to_evaluate
) const;

void addAggregatedInsertionsToInsertionCounts(
std::vector<QueryResultEntry>& output,
const std::string& sequence_name,
Expand All @@ -44,7 +52,10 @@ class InsertionAggregation : public Action {
) const;

public:
InsertionAggregation(std::string column, std::vector<std::string>&& sequence_names);
InsertionAggregation(
std::vector<std::string>&& column,
std::vector<std::string>&& sequence_names
);

void validateOrderByFields(const Database& database) const override;

Expand Down
8 changes: 6 additions & 2 deletions include/silo/storage/column_group.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ class ColumnPartitionGroup {
uint32_t sequence_id
) const;

template <typename SymbolType>
const std::map<std::string, storage::column::InsertionColumnPartition<SymbolType>&>&
template <typename Symbol>
const std::map<std::string, storage::column::InsertionColumnPartition<Symbol>&>&
getInsertionColumns() const;
};

Expand Down Expand Up @@ -148,6 +148,10 @@ class ColumnGroup {
std::map<std::string, storage::column::PangoLineageColumn> pango_lineage_columns;
std::map<std::string, storage::column::InsertionColumn<Nucleotide>> nuc_insertion_columns;
std::map<std::string, storage::column::InsertionColumn<AminoAcid>> aa_insertion_columns;

/// Returns the insertion columns (keyed by column name) for the given symbol type.
/// Explicitly specialized for Nucleotide (nuc_insertion_columns) and
/// AminoAcid (aa_insertion_columns).
template <typename SymbolType>
const std::map<std::string, storage::column::InsertionColumn<SymbolType>>& getInsertionColumns(
) const;
};

} // namespace silo::storage
Expand Down
18 changes: 18 additions & 0 deletions src/silo/database.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,24 @@ std::optional<std::string> Database::getDefaultSequenceName<AminoAcid>() const {
return std::nullopt;
}

/// Collects the names of all nucleotide sequences stored in this database.
///
/// @return a new vector holding every key of `nuc_sequences`, in the container's
///         iteration order.
template <>
std::vector<std::string> Database::getSequenceNames<Nucleotide>() const {
   std::vector<std::string> sequence_names;
   // The final size is known up front, so allocate once instead of growing repeatedly.
   sequence_names.reserve(nuc_sequences.size());
   for (const auto& [name, _] : nuc_sequences) {
      sequence_names.emplace_back(name);
   }
   return sequence_names;
}

/// Collects the names of all amino acid sequences stored in this database.
///
/// @return a new vector holding every key of `aa_sequences`, in the container's
///         iteration order.
template <>
std::vector<std::string> Database::getSequenceNames<AminoAcid>() const {
   std::vector<std::string> sequence_names;
   // The final size is known up front, so allocate once instead of growing repeatedly.
   sequence_names.reserve(aa_sequences.size());
   for (const auto& [name, _] : aa_sequences) {
      sequence_names.emplace_back(name);
   }
   return sequence_names;
}

/// Read-only accessor for the database's `alias_key` lookup.
const PangoLineageAliasLookup& Database::getAliasKey() const {
   return this->alias_key;
}
Expand Down
166 changes: 86 additions & 80 deletions src/silo/query_engine/actions/insertions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@ namespace silo::query_engine::actions {

template <typename SymbolType>
InsertionAggregation<SymbolType>::InsertionAggregation(
std::string column,
std::vector<std::string>&& column_names,
std::vector<std::string>&& sequence_names
)
: column_name(std::move(column)),
: column_names(std::move(column_names)),
sequence_names(std::move(sequence_names)) {}

template <typename SymbolType>
Expand All @@ -55,98 +55,88 @@ void InsertionAggregation<SymbolType>::validateOrderByFields(const Database& /*d
}
}

template <>
std::unordered_map<std::string, InsertionAggregation<AminoAcid>::PrefilteredBitmaps>
InsertionAggregation<AminoAcid>::validateFieldsAndPreFilterBitmaps(
const Database& database,
std::vector<OperatorResult>& bitmap_filter
) const {
CHECK_SILO_QUERY(
database.columns.aa_insertion_columns.contains(column_name),
"The column " + column_name + " does not exist."
)
std::unordered_map<std::string, PrefilteredBitmaps> bitmaps_to_evaluate;
for (size_t i = 0; i < database.partitions.size(); ++i) {
const DatabasePartition& database_partition = database.partitions.at(i);
const auto& insertion_indexes =
database_partition.columns.aa_insertion_columns.at(column_name).getInsertionIndexes();
OperatorResult& filter = bitmap_filter[i];
/// Verifies that every requested insertion column exists in the given partition's columns.
///
/// @tparam SymbolType    symbol type selecting which insertion columns are inspected
/// @param column_group   column partition group whose insertion columns are checked
/// @param column_names   names of the columns that must be present
///
/// A missing column is reported through CHECK_SILO_QUERY with the message
/// "The column '<name>' does not exist."
template <typename SymbolType>
void validateColumnNames(
   const storage::ColumnPartitionGroup& column_group,
   const std::vector<std::string>& column_names
) {
   // The set of insertion columns does not change while validating; look it up once.
   const auto& insertion_columns = column_group.getInsertionColumns<SymbolType>();
   // Bind by const reference: iterating by value copied each name on every iteration.
   for (const std::string& column_name : column_names) {
      CHECK_SILO_QUERY(
         insertion_columns.contains(column_name),
         "The column '" + column_name + "' does not exist."
      )
   }
}

const size_t cardinality = filter->cardinality();
if (cardinality == 0) {
continue;
}
if (cardinality == database_partition.sequence_count) {
for (const auto& [sequence_name, sequence_index] : insertion_indexes) {
if(sequence_names.empty() ||
std::find(sequence_names.begin(), sequence_names.end(), sequence_name) != sequence_names.end()){
bitmaps_to_evaluate[sequence_name].full_bitmaps.emplace_back(filter, sequence_index);
}
}
} else {
if (filter.isMutable()) {
filter->runOptimize();
}
for (const auto& [sequence_name, sequence_index] : insertion_indexes) {
if(sequence_names.empty() ||
std::find(sequence_names.begin(), sequence_names.end(), sequence_name) != sequence_names.end()){
bitmaps_to_evaluate[sequence_name].bitmaps.emplace_back(filter, sequence_index);
}
}
template <typename SymbolType>
void InsertionAggregation<SymbolType>::addAllColumnIndexesToPreFilteredBitmaps(
const storage::column::InsertionColumnPartition<SymbolType>& column,
const OperatorResult& filter,
std::unordered_map<std::string, InsertionAggregation<SymbolType>::PrefilteredBitmaps>&
bitmaps_to_evaluate
) const {
for (const auto& [sequence_name, sequence_index] : column.getInsertionIndexes()) {
if(sequence_names.empty() ||
std::find(sequence_names.begin(), sequence_names.end(), sequence_name) != sequence_names.end()){
bitmaps_to_evaluate[sequence_name].bitmaps.emplace_back(filter, sequence_index);
}
}
return bitmaps_to_evaluate;
}

template <>
std::unordered_map<std::string, InsertionAggregation<Nucleotide>::PrefilteredBitmaps>
InsertionAggregation<Nucleotide>::validateFieldsAndPreFilterBitmaps(
template <typename SymbolType>
std::unordered_map<std::string, typename InsertionAggregation<SymbolType>::PrefilteredBitmaps>
InsertionAggregation<SymbolType>::validateFieldsAndPreFilterBitmaps(
const Database& database,
std::vector<OperatorResult>& bitmap_filter
) const {
CHECK_SILO_QUERY(
database.columns.nuc_insertion_columns.contains(column_name),
"The column '" + column_name + "' does not exist."
)
for (const std::string& column_name : column_names) {
CHECK_SILO_QUERY(
database.columns.getInsertionColumns<SymbolType>().contains(column_name),
"The database does not contain the " + std::string(SymbolType::SYMBOL_NAME) + " column '" +
column_name + "'"
);
}
std::vector<std::string> all_sequence_names = database.getSequenceNames<SymbolType>();
for (const std::string& sequence_name : sequence_names) {
CHECK_SILO_QUERY(
std::find(all_sequence_names.begin(), all_sequence_names.end(), sequence_name) !=
all_sequence_names.end(),
"The database does not contain the " + std::string(SymbolType::SYMBOL_NAME) +
" sequence '" + sequence_name + "'"
);
}

std::unordered_map<std::string, PrefilteredBitmaps> bitmaps_to_evaluate;
std::unordered_map<std::string, PrefilteredBitmaps> pre_filtered_bitmaps;
for (size_t i = 0; i < database.partitions.size(); ++i) {
const DatabasePartition& database_partition = database.partitions.at(i);
const auto& insertion_indexes =
database_partition.columns.nuc_insertion_columns.at(column_name).getInsertionIndexes();
OperatorResult& filter = bitmap_filter[i];

for (const auto& sequence_name : sequence_names) {
CHECK_SILO_QUERY(
insertion_indexes.contains(sequence_name),
"The column '" + column_name + "' does not contain the sequence '" + sequence_name + "'"
)
}
validateColumnNames<SymbolType>(database_partition.columns, column_names);

const size_t cardinality = filter->cardinality();
if (cardinality == 0) {
continue;
}
if (cardinality == database_partition.sequence_count) {
for (const auto& [sequence_name, sequence_index] : insertion_indexes) {
if(sequence_names.empty() ||
std::find(sequence_names.begin(), sequence_names.end(), sequence_name) != sequence_names.end()){
bitmaps_to_evaluate[sequence_name].full_bitmaps.emplace_back(filter, sequence_index);
for (auto& [column_name, insertion_column] :
database_partition.columns.getInsertionColumns<SymbolType>()) {
if(column_names.empty() ||
std::find(column_names.begin(), column_names.end(), column_name) != column_names.end()){
OperatorResult& filter = bitmap_filter[i];
const size_t cardinality = filter->cardinality();
if (cardinality == 0) {
continue;
}
}
} else {
if (filter.isMutable()) {
filter->runOptimize();
}
for (const auto& [sequence_name, sequence_index] : insertion_indexes) {
if(sequence_names.empty() ||
std::find(sequence_names.begin(), sequence_names.end(), sequence_name) != sequence_names.end()){
bitmaps_to_evaluate[sequence_name].bitmaps.emplace_back(filter, sequence_index);
if (cardinality == database_partition.sequence_count) {
addAllColumnIndexesToPreFilteredBitmaps(
insertion_column, filter, pre_filtered_bitmaps
);
} else {
if (filter.isMutable()) {
filter->runOptimize();
}
addAllColumnIndexesToPreFilteredBitmaps(
insertion_column, filter, pre_filtered_bitmaps
);
}
}
}
}
return bitmaps_to_evaluate;
return pre_filtered_bitmaps;
}

struct PositionAndInsertion {
Expand Down Expand Up @@ -257,12 +247,28 @@ void from_json(
}

CHECK_SILO_QUERY(
json.contains("column") && json["column"].is_string(),
"Insertions must have the field 'column' of type string"
!json.contains("column") || (json["column"].is_string() || json["column"].is_array()),
"Insertions action can have the field column of type string or an array of "
"strings, but no other type"
)
const std::string column = json["column"].get<std::string>();
std::vector<std::string> column_names;
if (json.contains("column") && json.at("column").is_array()) {
for (const auto& child : json["column"]) {
CHECK_SILO_QUERY(
child.is_string(),
"The field column of the Insertions action must have type string or an "
"array, if present. Found:" +
child.dump()
)
column_names.emplace_back(child.get<std::string>());
}
} else if (json.contains("column") && json["column"].is_string()) {
column_names.emplace_back(json["column"].get<std::string>());
}

action = std::make_unique<InsertionAggregation<SymbolType>>(column, std::move(sequence_names));
action = std::make_unique<InsertionAggregation<SymbolType>>(
std::move(column_names), std::move(sequence_names)
);
}

template void from_json<AminoAcid>(
Expand Down
11 changes: 11 additions & 0 deletions src/silo/storage/column_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,4 +149,15 @@ std::optional<std::variant<std::string, int32_t, double>> ColumnPartitionGroup::
return std::nullopt;
}

/// Nucleotide specialization: exposes the nucleotide insertion columns, keyed by column name.
template <>
const std::map<std::string, storage::column::InsertionColumn<Nucleotide>>&
ColumnGroup::getInsertionColumns<Nucleotide>() const {
   return this->nuc_insertion_columns;
}
/// Amino acid specialization: exposes the amino acid insertion columns, keyed by column name.
template <>
const std::map<std::string, storage::column::InsertionColumn<AminoAcid>>&
ColumnGroup::getInsertionColumns<AminoAcid>() const {
   return this->aa_insertion_columns;
}

} // namespace silo::storage

0 comments on commit 6b70241

Please sign in to comment.