Skip to content

Commit

Permalink
feat: implement insertion columns and search
Browse files Browse the repository at this point in the history
  • Loading branch information
danielgrittner committed Jul 16, 2023
1 parent c14a370 commit 9167236
Show file tree
Hide file tree
Showing 25 changed files with 845 additions and 24 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"testCaseName": "Insertion Contains with invalid pattern",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "InsertionContains",
"column": "insertions",
"value": "CC+++"
}
},
"expectedError": {
"error": "Bad request",
"message": "The field 'value' in the InsertionContains expression does not contain a valid regex pattern: \"CC+++\""
}
}
18 changes: 18 additions & 0 deletions endToEndTests/test/queries/insertionContains_exact.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"testCaseName": "Insertion Contains with exact match CCC",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "InsertionContains",
"column": "insertions",
"value": "CCC"
}
},
"expectedQueryResult": [
{
"count": 17
}
]
}
18 changes: 18 additions & 0 deletions endToEndTests/test/queries/insertionContains_not_exact1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"testCaseName": "Insertion Contains with non-exact match .*GCT.*GGT.*",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "InsertionContains",
"column": "insertions",
"value": ".*GCT.*GGT.*"
}
},
"expectedQueryResult": [
{
"count": 1
}
]
}
18 changes: 18 additions & 0 deletions endToEndTests/test/queries/insertionContains_not_exact2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"testCaseName": "Insertion Contains with non-exact match CAG.*AA",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "InsertionContains",
"column": "insertions",
"value": "CAG.*AA"
}
},
"expectedQueryResult": [
{
"count": 1
}
]
}
18 changes: 18 additions & 0 deletions endToEndTests/test/queries/insertionContains_not_exact3.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"testCaseName": "Insertion Contains with non-exact match TCAG.*AA",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "InsertionContains",
"column": "insertions",
"value": "TCAG.*AA"
}
},
"expectedQueryResult": [
{
"count": 0
}
]
}
18 changes: 18 additions & 0 deletions endToEndTests/test/queries/insertionContains_not_exact4.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"testCaseName": "Insertion Contains with non-exact match CC.*",
"query": {
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "InsertionContains",
"column": "insertions",
"value": "CC.*"
}
},
"expectedQueryResult": [
{
"count": 17
}
]
}
22 changes: 22 additions & 0 deletions include/silo/common/nucleotide_symbols.h
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
#ifndef SILO_NUCLEOTIDE_SYMBOLS_H
#define SILO_NUCLEOTIDE_SYMBOLS_H

#include <algorithm>
#include <array>
#include <cstdint>
#include <optional>
#include <stdexcept>
#include <string>
#include <vector>

namespace silo {

Expand Down Expand Up @@ -125,6 +129,24 @@ inline std::optional<NUCLEOTIDE_SYMBOL> charToNucleotideSymbol(char character) {
return std::nullopt;
}
}

/// Converts a single character into the numeric id of its nucleotide symbol.
///
/// @param c character to look up via charToNucleotideSymbol
/// @return the matched symbol cast to size_t
/// @throws std::invalid_argument if charToNucleotideSymbol yields no symbol for `c`
inline size_t toNucleotideSymbolId(char c) {
   const auto nuc_opt = charToNucleotideSymbol(c);
   if (nuc_opt.has_value()) {
      return static_cast<size_t>(*nuc_opt);
   }
   // Start from a std::string so that `+ c` appends the character. Adding a char directly to
   // the string literal would be pointer arithmetic on a const char* (offsetting the pointer by
   // the character's numeric value), yielding a garbage message and an out-of-bounds read.
   throw std::invalid_argument(std::string("Invalid nucleotide symbol: ") + c);
}

/// Maps every character of a nucleotide string to its symbol id.
///
/// @param nucleotides characters to convert, one symbol per character
/// @return one id per input character, in input order
/// @throws std::invalid_argument propagated from toNucleotideSymbolId for an unrecognized character
inline std::vector<size_t> toNucleotideSymbolIds(const std::string& nucleotides) {
   std::vector<size_t> symbol_ids;
   // The output has exactly one entry per input character, so allocate once up front.
   symbol_ids.reserve(nucleotides.size());
   for (const char nucleotide : nucleotides) {
      symbol_ids.push_back(toNucleotideSymbolId(nucleotide));
   }
   return symbol_ids;
}

} // namespace silo

#endif // SILO_NUCLEOTIDE_SYMBOLS_H
18 changes: 18 additions & 0 deletions include/silo/common/numeric_conversion.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#ifndef SILO_NUMERIC_CONVERSION_H
#define SILO_NUMERIC_CONVERSION_H

#include <cstdint>
#include <stdexcept>
#include <string>

namespace silo {

/// Exception type for numeric conversion failures; derives from std::runtime_error.
/// NOTE(review): presumably thrown by tryConvertStringToU32 — the definitions are not part of
/// this header, confirm in the implementation file.
class NumericConversionException : public std::runtime_error {
public:
/// @param error_message description of the failure (presumably forwarded to
///        std::runtime_error so it is available through what() — confirm in the .cpp)
explicit NumericConversionException(const std::string& error_message);
};

/// Declared to take a string and return a uint32_t; by its name it parses `input` as an
/// unsigned 32-bit integer.
/// NOTE(review): presumably signals invalid or out-of-range input by throwing
/// NumericConversionException — the definition is not visible in this header, confirm there.
uint32_t tryConvertStringToU32(const std::string& input);

} // namespace silo

#endif // SILO_NUMERIC_CONVERSION_H
14 changes: 14 additions & 0 deletions include/silo/common/string_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#ifndef SILO_STRING_UTILS_H
#define SILO_STRING_UTILS_H

#include <string>
#include <string_view>
#include <vector>

namespace silo {

/// Declared to take a string and a delimiter view and return a vector of strings; by its name
/// it splits `value` at each occurrence of `delimiter`.
/// NOTE(review): treatment of empty pieces, a trailing delimiter, or an empty delimiter is not
/// visible in this header — confirm against the implementation.
std::vector<std::string> splitBy(const std::string& value, const std::string_view delimiter);

} // namespace silo

#endif // SILO_STRING_UTILS_H
1 change: 1 addition & 0 deletions include/silo/database.h
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ class Database {
void initializeColumns();
void initializeColumn(config::ColumnType column_type, const std::string& name);
void initializeSequences();
void initializeInsertionIndexes();

static BitmapSizePerSymbol calculateBitmapSizePerSymbol(const SequenceStore& seq_store);

Expand Down
41 changes: 41 additions & 0 deletions include/silo/query_engine/operators/bitmap_producer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#ifndef SILO_BITMAP_PRODUCER_H
#define SILO_BITMAP_PRODUCER_H

#include <cstdint>
#include <functional>
#include <memory>
#include <string>

#include "silo/query_engine/operator_result.h"
#include "silo/query_engine/operators/operator.h"

namespace roaring {
class Roaring;
} // namespace roaring

namespace silo::query_engine::operators {

class BitmapProducer : public Operator {
private:
std::function<OperatorResult()> producer;
uint32_t row_count;

public:
explicit BitmapProducer(std::function<OperatorResult()> producer, uint32_t row_count);

~BitmapProducer() noexcept override;

[[nodiscard]] virtual Type type() const override;

virtual OperatorResult evaluate() const override;

virtual std::string toString() const override;

virtual std::unique_ptr<Operator> copy() const override;

virtual std::unique_ptr<Operator> negate() const override;
};

} // namespace silo::query_engine::operators

#endif // SILO_BITMAP_PRODUCER_H
3 changes: 2 additions & 1 deletion include/silo/query_engine/operators/operator.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ enum Type {
SELECTION,
BITMAP_SELECTION,
THRESHOLD,
UNION
UNION,
BITMAP_PRODUCER
};

class Operator {
Expand Down
22 changes: 9 additions & 13 deletions include/silo/storage/column/insertion_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@

#include "silo/common/bidirectional_map.h"
#include "silo/common/types.h"
#include "silo/storage/column/insertion_index.h"

namespace boost::serialization {
struct access;
}
} // namespace boost::serialization

namespace silo::storage::column {

Expand All @@ -27,27 +28,25 @@ class InsertionColumnPartition {
template <class Archive>
[[maybe_unused]] void serialize(Archive& archive, const uint32_t /* version */) {
// clang-format off
// TODO(#164) serialize data-structures
archive& values;
// clang-format on
}

// TODO(#164) remove this and add special datastructures here
std::vector<silo::Idx> values;
common::BidirectionalMap<std::string>& lookup;
insertion::InsertionIndex insertion_index;

public:
explicit InsertionColumnPartition(
common::BidirectionalMap<std::string>& lookup
// TODO(#164) add datastructures that need to be synchronized across partitions here
);
explicit InsertionColumnPartition(common::BidirectionalMap<std::string>& lookup);

void insert(const std::string& value);

// TODO(#164) Return a type that is byte-wise comparable
void buildInsertionIndex();

[[nodiscard]] std::unique_ptr<roaring::Roaring> search(const std::string& search_pattern) const;

[[nodiscard]] const std::vector<silo::Idx>& getValues() const;

// TODO(#164) Maybe require helper function to return the original string value from the
// internal representation type used for querying
[[nodiscard]] std::string lookupValue(silo::Idx value_id) const;
};

Expand All @@ -58,15 +57,12 @@ class InsertionColumn {
template <class Archive>
[[maybe_unused]] void serialize(Archive& archive, const uint32_t /* version */) {
// clang-format off
// TODO(#164) serialize data-structures
// clang-format on
}

std::deque<InsertionColumnPartition> partitions;
std::unique_ptr<common::BidirectionalMap<std::string>> lookup;

// TODO(#164) synchronized data-structures

public:
InsertionColumn();

Expand Down
74 changes: 74 additions & 0 deletions include/silo/storage/column/insertion_index.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#ifndef SILO_INSERTION_INDEX_H
#define SILO_INSERTION_INDEX_H

#include <array>
#include <memory>
#include <regex>
#include <string>
#include <unordered_map>
#include <vector>

#include <boost/archive/binary_iarchive.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <roaring/roaring.hh>

#include "silo/common/nucleotide_symbols.h"

namespace boost::serialization {
struct access;
} // namespace boost::serialization

namespace silo::storage::column::insertion {

// Index over insertions exposing addLazily(), buildIndex() and search().
// NOTE(review): judging by the member names, insertions are first gathered in
// collected_insertions and turned into insertion_positions by buildIndex() — the definitions
// are not in this header, confirm in the implementation file.
class InsertionIndex {
friend class boost::serialization::access;

public:
// Three size_t values; presumably the nucleotide symbol ids of a 3-mer — confirm.
using three_mer_t = std::array<size_t, 3>;
// List of 32-bit sequence ids.
using sequence_ids_t = std::vector<uint32_t>;

private:
// Boost.Serialization hook. Only insertion_positions is archived; collected_insertions is not.
// NOTE(review): no serialize() for InsertionPosition / Insertion is visible in this header —
// confirm one exists wherever this template gets instantiated.
template <class Archive>
[[maybe_unused]] void serialize(Archive& archive, const uint32_t /* version */) {
// clang-format off
archive& insertion_positions;
// clang-format on
}

// An insertion string together with a list of sequence ids
// (presumably the sequences that carry this insertion — confirm).
struct Insertion {
std::string value;
sequence_ids_t sequence_ids;
};

// Three nested arrays of extent NUC_SYMBOL_COUNT each, i.e. one sequence_ids_t per triple of
// indexes below NUC_SYMBOL_COUNT (presumably one slot per nucleotide 3-mer — confirm).
using one_mer_index_t = std::array<sequence_ids_t, NUC_SYMBOL_COUNT>;
using two_mer_index_t = std::array<one_mer_index_t, NUC_SYMBOL_COUNT>;
using three_mer_index_t = std::array<two_mer_index_t, NUC_SYMBOL_COUNT>;

// Groups a position value, its insertions and a 3-mer index over them.
struct InsertionPosition {
uint32_t position;
std::vector<Insertion> insertions;
three_mer_index_t three_mer_index;

// Presumably fills three_mer_index from `insertions` — definition not visible here.
void buildThreeMerIndex();

// Takes the 3-mers of a search and the compiled regex and returns sequence ids.
// NOTE(review): presumably the 3-mer index narrows candidates before the regex is applied —
// confirm in the implementation.
sequence_ids_t searchWithThreeMerIndex(
const std::vector<three_mer_t>& search_three_mers,
const std::regex& search_pattern
) const;
};

std::vector<InsertionPosition> insertion_positions;
// Two-level map: uint32_t key -> string key -> sequence ids
// (presumably position -> insertion value -> sequences; confirm against addLazily()).
std::unordered_map<uint32_t, std::unordered_map<std::string, sequence_ids_t>>
collected_insertions;

public:
// Takes an insertions string and the id of the sequence it belongs to.
// NOTE(review): the expected format of insertions_string is not visible in this header.
void addLazily(const std::string& insertions_string, uint32_t sequence_id);

// Presumably converts the lazily collected insertions into insertion_positions — confirm.
void buildIndex();

// Returns a heap-allocated roaring bitmap for the given search pattern.
// NOTE(review): presumably the bitmap holds the ids of matching sequences and the pattern is
// treated as a regex (see the std::regex parameter above) — confirm in the implementation.
std::unique_ptr<roaring::Roaring> search(const std::string& search_pattern) const;
};

} // namespace silo::storage::column::insertion

#endif // SILO_INSERTION_INDEX_H
Loading

0 comments on commit 9167236

Please sign in to comment.