-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: implement insertion columns and search
- Loading branch information
1 parent
c14a370
commit 9167236
Showing
25 changed files
with
845 additions
and
24 deletions.
There are no files selected for viewing
17 changes: 17 additions & 0 deletions
17
endToEndTests/test/invalidQueries/insertionContains_invalidPattern.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
{ | ||
"testCaseName": "Insertion Contains with invalid pattern", | ||
"query": { | ||
"action": { | ||
"type": "Aggregated" | ||
}, | ||
"filterExpression": { | ||
"type": "InsertionContains", | ||
"column": "insertions", | ||
"value": "CC+++" | ||
} | ||
}, | ||
"expectedError": { | ||
"error": "Bad request", | ||
"message": "The field 'value' in the InsertionContains expression does not contain a valid regex pattern: \"CC+++\"" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
{ | ||
"testCaseName": "Insertion Contains with exact match CCC", | ||
"query": { | ||
"action": { | ||
"type": "Aggregated" | ||
}, | ||
"filterExpression": { | ||
"type": "InsertionContains", | ||
"column": "insertions", | ||
"value": "CCC" | ||
} | ||
}, | ||
"expectedQueryResult": [ | ||
{ | ||
"count": 17 | ||
} | ||
] | ||
} |
18 changes: 18 additions & 0 deletions
18
endToEndTests/test/queries/insertionContains_not_exact1.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
{ | ||
"testCaseName": "Insertion Contains with non-exact match .*GCT.*GGT.*", | ||
"query": { | ||
"action": { | ||
"type": "Aggregated" | ||
}, | ||
"filterExpression": { | ||
"type": "InsertionContains", | ||
"column": "insertions", | ||
"value": ".*GCT.*GGT.*" | ||
} | ||
}, | ||
"expectedQueryResult": [ | ||
{ | ||
"count": 1 | ||
} | ||
] | ||
} |
18 changes: 18 additions & 0 deletions
18
endToEndTests/test/queries/insertionContains_not_exact2.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
{ | ||
"testCaseName": "Insertion Contains with non-exact match CAG.*AA", | ||
"query": { | ||
"action": { | ||
"type": "Aggregated" | ||
}, | ||
"filterExpression": { | ||
"type": "InsertionContains", | ||
"column": "insertions", | ||
"value": "CAG.*AA" | ||
} | ||
}, | ||
"expectedQueryResult": [ | ||
{ | ||
"count": 1 | ||
} | ||
] | ||
} |
18 changes: 18 additions & 0 deletions
18
endToEndTests/test/queries/insertionContains_not_exact3.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
{ | ||
"testCaseName": "Insertion Contains with non-exact match TCAG.*AA", | ||
"query": { | ||
"action": { | ||
"type": "Aggregated" | ||
}, | ||
"filterExpression": { | ||
"type": "InsertionContains", | ||
"column": "insertions", | ||
"value": "TCAG.*AA" | ||
} | ||
}, | ||
"expectedQueryResult": [ | ||
{ | ||
"count": 0 | ||
} | ||
] | ||
} |
18 changes: 18 additions & 0 deletions
18
endToEndTests/test/queries/insertionContains_not_exact4.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
{ | ||
"testCaseName": "Insertion Contains with non-exact match CC.*", | ||
"query": { | ||
"action": { | ||
"type": "Aggregated" | ||
}, | ||
"filterExpression": { | ||
"type": "InsertionContains", | ||
"column": "insertions", | ||
"value": "CC.*" | ||
} | ||
}, | ||
"expectedQueryResult": [ | ||
{ | ||
"count": 17 | ||
} | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
#ifndef SILO_NUMERIC_CONVERSION_H | ||
#define SILO_NUMERIC_CONVERSION_H | ||
|
||
#include <stdexcept> | ||
#include <string> | ||
|
||
namespace silo { | ||
|
||
// Exception type for the numeric-conversion helpers declared in this header. | ||
// Derives publicly from std::runtime_error, so it can be caught as | ||
// std::runtime_error or std::exception. | ||
// NOTE(review): the constructor is only declared here; presumably it forwards | ||
// error_message to std::runtime_error - confirm in the matching .cpp. | ||
class NumericConversionException : public std::runtime_error { | ||
public: | ||
// explicit: a std::string never converts implicitly into this exception. | ||
explicit NumericConversionException(const std::string& error_message); | ||
}; | ||
|
||
// Converts the text in `input` to a uint32_t and returns it by value. | ||
// NOTE(review): presumably throws NumericConversionException when `input` is not | ||
// a valid unsigned 32-bit number - the definition is not in this header; confirm. | ||
// NOTE(review): uint32_t is used here, but this header only includes <stdexcept> | ||
// and <string>; it relies on a transitive include for <cstdint>. | ||
uint32_t tryConvertStringToU32(const std::string& input); | ||
|
||
} // namespace silo | ||
|
||
#endif // SILO_NUMERIC_CONVERSION_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#ifndef SILO_STRING_UTILS_H | ||
#define SILO_STRING_UTILS_H | ||
|
||
#include <string> | ||
#include <string_view> | ||
#include <vector> | ||
|
||
namespace silo { | ||
|
||
// Returns a vector of owning strings derived from `value` and `delimiter`. | ||
// NOTE(review): by its name this splits `value` at each occurrence of `delimiter`; | ||
// handling of an empty delimiter, adjacent delimiters and a trailing delimiter is | ||
// defined in the .cpp, which is not shown here - confirm before relying on it. | ||
// The top-level `const` on the by-value std::string_view parameter does not | ||
// affect the function's signature; it only matters inside the definition. | ||
std::vector<std::string> splitBy(const std::string& value, const std::string_view delimiter); | ||
|
||
} // namespace silo | ||
|
||
#endif // SILO_STRING_UTILS_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
#ifndef SILO_BITMAP_PRODUCER_H | ||
#define SILO_BITMAP_PRODUCER_H | ||
|
||
#include <cstdint> | ||
#include <functional> | ||
#include <memory> | ||
#include <string> | ||
|
||
#include "silo/query_engine/operator_result.h" | ||
#include "silo/query_engine/operators/operator.h" | ||
|
||
namespace roaring { | ||
class Roaring; | ||
} // namespace roaring | ||
|
||
namespace silo::query_engine::operators { | ||
|
||
// Query-engine operator that stores a callable returning an OperatorResult | ||
// together with a row count. Implements the Operator interface by overriding | ||
// type(), evaluate(), toString(), copy() and negate(). | ||
// NOTE(review): presumably evaluate() invokes `producer` to obtain the result | ||
// bitmap on demand - the member definitions are not part of this header; confirm. | ||
class BitmapProducer : public Operator { | ||
private: | ||
// Type-erased callable taking no arguments and returning an OperatorResult. | ||
std::function<OperatorResult()> producer; | ||
// NOTE(review): presumably the total number of rows the bitmap ranges over | ||
// (e.g. needed to negate the result) - confirm against the .cpp. | ||
uint32_t row_count; | ||
|
||
public: | ||
// Takes the producer callable by value and the row count. | ||
explicit BitmapProducer(std::function<OperatorResult()> producer, uint32_t row_count); | ||
|
||
// Overrides the base-class destructor; declared noexcept. | ||
~BitmapProducer() noexcept override; | ||
|
||
// Returns the operator's Type tag; [[nodiscard]] flags ignored results. | ||
[[nodiscard]] virtual Type type() const override; | ||
|
||
// Produces this operator's OperatorResult; const, so it does not modify members. | ||
virtual OperatorResult evaluate() const override; | ||
|
||
// Returns a std::string describing this operator. | ||
virtual std::string toString() const override; | ||
|
||
// Returns a newly owned Operator; presumably a copy of this one - confirm. | ||
virtual std::unique_ptr<Operator> copy() const override; | ||
|
||
// Returns a newly owned Operator; presumably the logical complement - confirm. | ||
virtual std::unique_ptr<Operator> negate() const override; | ||
}; | ||
|
||
} // namespace silo::query_engine::operators | ||
|
||
#endif // SILO_BITMAP_PRODUCER_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
#ifndef SILO_INSERTION_INDEX_H | ||
#define SILO_INSERTION_INDEX_H | ||
|
||
#include <array> | ||
#include <memory> | ||
#include <regex> | ||
#include <string> | ||
#include <unordered_map> | ||
#include <vector> | ||
|
||
#include <boost/archive/binary_iarchive.hpp> | ||
#include <boost/archive/binary_oarchive.hpp> | ||
#include <roaring/roaring.hh> | ||
|
||
#include "silo/common/nucleotide_symbols.h" | ||
|
||
namespace boost::serialization { | ||
struct access; | ||
} // namespace boost::serialization | ||
|
||
namespace silo::storage::column::insertion { | ||
|
||
// Index over insertions of a column. Insertions are first collected per | ||
// sequence with addLazily(), the index is then built with buildIndex(), and | ||
// search() returns the matching entries as a roaring bitmap. | ||
// NOTE(review): the calling order above is inferred from the member names; the | ||
// definitions are not in this header - confirm. | ||
class InsertionIndex { | ||
// Lets Boost.Serialization call the private serialize() member below. | ||
friend class boost::serialization::access; | ||
|
||
public: | ||
// Three size_t values; presumably the symbol indices of three consecutive | ||
// nucleotides, used to address three_mer_index_t - confirm. | ||
using three_mer_t = std::array<size_t, 3>; | ||
// List of 32-bit sequence ids. | ||
using sequence_ids_t = std::vector<uint32_t>; | ||
|
||
private: | ||
// Boost.Serialization hook. Only insertion_positions is archived; | ||
// collected_insertions is not part of the serialized state. | ||
// NOTE(review): archiving std::vector<InsertionPosition> requires | ||
// InsertionPosition (and Insertion) to be serializable; no serialize function | ||
// for them is visible in this excerpt - confirm it exists elsewhere. | ||
template <class Archive> | ||
[[maybe_unused]] void serialize(Archive& archive, const uint32_t /* version */) { | ||
// clang-format off | ||
archive& insertion_positions; | ||
// clang-format on | ||
} | ||
|
||
// One distinct inserted string and the ids of the sequences listed for it. | ||
struct Insertion { | ||
std::string value; | ||
sequence_ids_t sequence_ids; | ||
}; | ||
|
||
// Nested fixed-size arrays: three levels of NUC_SYMBOL_COUNT entries, with a | ||
// sequence_ids_t at the innermost level (NUC_SYMBOL_COUNT^3 vectors in total). | ||
using one_mer_index_t = std::array<sequence_ids_t, NUC_SYMBOL_COUNT>; | ||
using two_mer_index_t = std::array<one_mer_index_t, NUC_SYMBOL_COUNT>; | ||
using three_mer_index_t = std::array<two_mer_index_t, NUC_SYMBOL_COUNT>; | ||
|
||
// All insertions recorded for one position, plus a three-mer index over them. | ||
// The std::array-based three_mer_index is stored inline in every instance. | ||
struct InsertionPosition { | ||
uint32_t position; | ||
std::vector<Insertion> insertions; | ||
three_mer_index_t three_mer_index; | ||
|
||
// NOTE(review): presumably fills three_mer_index from `insertions` - confirm. | ||
void buildThreeMerIndex(); | ||
|
||
// Takes a list of three-mers and a compiled regex and returns sequence ids. | ||
// NOTE(review): presumably the three-mers pre-select candidates which are | ||
// then checked against search_pattern - confirm in the .cpp. | ||
sequence_ids_t searchWithThreeMerIndex( | ||
const std::vector<three_mer_t>& search_three_mers, | ||
const std::regex& search_pattern | ||
) const; | ||
}; | ||
|
||
// The state written by serialize(). | ||
std::vector<InsertionPosition> insertion_positions; | ||
// Nested map: uint32_t key -> (string key -> sequence ids). | ||
// NOTE(review): presumably position -> inserted string -> sequence ids, filled | ||
// by addLazily() and consumed by buildIndex() - confirm. | ||
std::unordered_map<uint32_t, std::unordered_map<std::string, sequence_ids_t>> | ||
collected_insertions; | ||
|
||
public: | ||
// Records the insertions described by insertions_string for sequence_id. | ||
// NOTE(review): the expected format of insertions_string is not visible here. | ||
void addLazily(const std::string& insertions_string, uint32_t sequence_id); | ||
|
||
// NOTE(review): presumably turns collected_insertions into insertion_positions | ||
// and builds the per-position three-mer indexes - confirm. | ||
void buildIndex(); | ||
|
||
// Returns a newly owned roaring bitmap for the given search pattern. | ||
// NOTE(review): the end-to-end tests in this commit pass regex-style values | ||
// (e.g. ".*GCT.*GGT.*") and expect an error for an invalid pattern ("CC+++"); | ||
// how an invalid pattern is reported from this function is not visible here. | ||
std::unique_ptr<roaring::Roaring> search(const std::string& search_pattern) const; | ||
}; | ||
|
||
} // namespace silo::storage::column::insertion | ||
|
||
#endif // SILO_INSERTION_INDEX_H |
Oops, something went wrong.