Skip to content

Commit

Permalink
fix: allow all sequence-names by escaping them properly in all SQL statements
Browse files Browse the repository at this point in the history
  • Loading branch information
Taepper committed Aug 6, 2024
1 parent e897203 commit 901fc7e
Show file tree
Hide file tree
Showing 92 changed files with 949 additions and 461 deletions.
14 changes: 8 additions & 6 deletions include/silo/common/table_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,27 @@

#include <duckdb.hpp>

#include "silo/preprocessing/identifier.h"

namespace silo {

// Pairs a column name with a callback that receives a duckdb::Vector and two
// size_t arguments. TableReader is declared a friend, so it can read both
// private members directly.
// NOTE(review): this text is a rendered diff without +/- markers. The
// `std::string column_name` lines appear to be the pre-commit declarations and
// the `Identifier column_name` lines their replacements; the real header holds
// only one of each pair.
class ColumnFunction {
friend class TableReader;
std::string column_name;
silo::preprocessing::Identifier column_name;
// NOTE(review): the meaning of the two size_t arguments is not visible here —
// confirm against the TableReader implementation.
std::function<void(size_t, const duckdb::Vector&, size_t)> function;

public:
// Takes the column name and the callback by value.
ColumnFunction(
std::string column_name,
preprocessing::Identifier column_name,
std::function<void(size_t, const duckdb::Vector&, size_t)> function
);
};

class TableReader {
private:
duckdb::Connection& connection;
std::string table_name;
std::string key_column;
preprocessing::Identifier table_name;
preprocessing::Identifier key_column;
std::vector<ColumnFunction> column_functions;
std::string where_clause;
std::string order_by_clause;
Expand All @@ -38,8 +40,8 @@ class TableReader {
public:
explicit TableReader(
duckdb::Connection& connection,
std::string_view table_name,
std::string_view key_column,
preprocessing::Identifier table_name,
preprocessing::Identifier key_column,
std::vector<ColumnFunction> column_functions,
std::string_view where_clause,
std::string_view order_by_clause
Expand Down
6 changes: 3 additions & 3 deletions include/silo/config/preprocessing_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,12 @@ class PreprocessingConfig {

[[nodiscard]] std::optional<std::filesystem::path> getMetadataInputFilename() const;

[[nodiscard]] std::filesystem::path getNucFilenameNoExtension(std::string_view nuc_name) const;
[[nodiscard]] std::filesystem::path getNucFilenameNoExtension(size_t sequence_idx) const;

[[nodiscard]] std::filesystem::path getUnalignedNucFilenameNoExtension(std::string_view nuc_name
[[nodiscard]] std::filesystem::path getUnalignedNucFilenameNoExtension(size_t sequence_idx
) const;

[[nodiscard]] std::filesystem::path getGeneFilenameNoExtension(std::string_view gene_name) const;
[[nodiscard]] std::filesystem::path getGeneFilenameNoExtension(size_t sequence_idx) const;

[[nodiscard]] std::filesystem::path getNucleotideInsertionsFilename() const;

Expand Down
13 changes: 9 additions & 4 deletions include/silo/database.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ class Database {

silo::storage::ColumnGroup columns;

std::vector<std::string> nuc_sequence_names;
std::vector<std::string> aa_sequence_names;

std::map<std::string, SequenceStore<Nucleotide>> nuc_sequences;
std::map<std::string, SequenceStore<AminoAcid>> aa_sequences;
std::map<std::string, UnalignedSequenceStore> unaligned_nuc_sequences;
Expand Down Expand Up @@ -79,17 +82,19 @@ class Database {
virtual query_engine::QueryResult executeQuery(const std::string& query) const;

private:
std::map<std::string, std::vector<Nucleotide::Symbol>> getNucSequences() const;
std::vector<std::vector<Nucleotide::Symbol>> getNucSequences() const;

std::map<std::string, std::vector<AminoAcid::Symbol>> getAASequences() const;
std::vector<std::vector<AminoAcid::Symbol>> getAASequences() const;

void initializeColumns();
void initializeColumn(config::ColumnType column_type, const std::string& name);
void initializeNucSequences(
const std::map<std::string, std::vector<Nucleotide::Symbol>>& reference_sequences
const std::vector<std::string>& sequence_names,
const std::vector<std::vector<Nucleotide::Symbol>>& reference_sequences
);
void initializeAASequences(
const std::map<std::string, std::vector<AminoAcid::Symbol>>& reference_sequences
const std::vector<std::string>& sequence_names,
const std::vector<std::vector<AminoAcid::Symbol>>& reference_sequences
);

template <typename SymbolType>
Expand Down
21 changes: 21 additions & 0 deletions include/silo/preprocessing/identifier.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#pragma once

#include <string>
#include <vector>

namespace silo::preprocessing {

/// A single identifier (e.g. a table, column or function name) kept in its
/// raw, unescaped form, with helpers that produce an escaped spelling.
///
/// The constructor is explicit, so a plain std::string never silently becomes
/// an Identifier; call sites have to opt in.
///
/// NOTE(review): the exact quoting rules live in the implementation file and
/// are not visible from this header — presumably the escaped form is what gets
/// embedded into SQL statements; confirm there.
class Identifier {
   std::string raw_identifier;

  public:
   /// @param identifier Text of the identifier, taken by value.
   explicit Identifier(std::string identifier);

   /// Produces the escaped spelling of an arbitrary identifier string without
   /// needing an Identifier object.
   /// @param identifier Identifier text to escape.
   /// @return The escaped identifier as a new string.
   [[nodiscard]] static std::string escapeIdentifier(const std::string& identifier);

   /// @return A const reference to the raw identifier text.
   /// NOTE(review): presumably refers to the stored member, in which case it
   /// is only valid while this object is alive — confirm in the definition.
   [[nodiscard]] const std::string& getRawIdentifier() const;

   /// @return The escaped spelling of this identifier as a new string.
   /// NOTE(review): presumably escapeIdentifier applied to the stored text.
   [[nodiscard]] std::string escape() const;
};

} // namespace silo::preprocessing
27 changes: 27 additions & 0 deletions include/silo/preprocessing/identifiers.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#pragma once

#include <string>
#include <vector>

#include "silo/preprocessing/identifier.h"

namespace silo::preprocessing {

/// An ordered collection of Identifier objects, with bulk accessors for their
/// raw and escaped string forms.
class Identifiers {
   std::vector<Identifier> identifiers;

  public:
   /// Builds the collection from a list of raw identifier strings.
   /// NOTE(review): this constructor is not `explicit`, so a
   /// std::vector<std::string> converts implicitly. It is left that way to
   /// keep existing call sites compiling — confirm the conversion is intended.
   Identifiers(const std::vector<std::string>& raw_identifiers);

   /// @param prefix Text to combine with every identifier.
   /// @return A new collection; this object is not modified (const method).
   /// NOTE(review): presumably each raw identifier gets `prefix` prepended —
   /// confirm in the implementation.
   [[nodiscard]] Identifiers prefix(const std::string& prefix) const;

   /// @return Number of identifiers — presumably the length of the underlying vector.
   [[nodiscard]] size_t size() const;

   /// @param index Position of the requested identifier.
   /// @return A copy of the identifier at `index`.
   /// NOTE(review): bounds handling is not visible from this header.
   [[nodiscard]] Identifier getIdentifier(size_t index) const;

   /// @return The identifiers as strings in raw (unescaped) form.
   [[nodiscard]] std::vector<std::string> getRawIdentifierStrings() const;

   /// @return The identifiers as strings in escaped form.
   [[nodiscard]] std::vector<std::string> getEscapedIdentifierStrings() const;
};

} // namespace silo::preprocessing
6 changes: 2 additions & 4 deletions include/silo/preprocessing/preprocessing_database.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,8 @@ class Partitions;

class PreprocessingDatabase {
public:
std::unordered_map<std::string_view, std::unique_ptr<CompressSequence>>
compress_nucleotide_functions;
std::unordered_map<std::string_view, std::unique_ptr<CompressSequence>>
compress_amino_acid_functions;
std::vector<std::unique_ptr<CompressSequence>> compress_nucleotide_functions;
std::vector<std::unique_ptr<CompressSequence>> compress_amino_acid_functions;

private:
duckdb::DuckDB duck_db;
Expand Down
50 changes: 27 additions & 23 deletions include/silo/preprocessing/preprocessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "silo/common/table_reader.h"
#include "silo/config/database_config.h"
#include "silo/config/preprocessing_config.h"
#include "silo/preprocessing/identifiers.h"
#include "silo/preprocessing/preprocessing_database.h"
#include "silo/storage/pango_lineage_alias.h"
#include "silo/storage/reference_genomes.h"
Expand All @@ -22,30 +23,35 @@ class ValidatedNdjsonFile;
class Preprocessor {
config::PreprocessingConfig preprocessing_config;
config::DatabaseConfig database_config;
ReferenceGenomes reference_genomes;
PangoLineageAliasLookup alias_lookup;

PreprocessingDatabase preprocessing_db;
ReferenceGenomes reference_genomes_;
PangoLineageAliasLookup alias_lookup_;

std::vector<std::string> nuc_sequences;
std::vector<std::string> aa_sequences;
std::vector<std::string> order_by_fields;
std::vector<std::string> prefixed_order_by_fields;
std::vector<std::string> prefixed_nuc_sequences;
std::vector<std::string> prefixed_aa_sequences;
std::vector<std::string> prefixed_nuc_insertions_fields;
std::vector<std::string> prefixed_aa_insertions_fields;

Identifiers nuc_sequence_identifiers_without_prefix;
Identifiers aa_sequence_identifiers_without_prefix;
Identifiers nuc_sequence_identifiers;
Identifiers aa_sequence_identifiers;
Identifiers unaligned_nuc_sequences;
Identifiers order_by_fields_without_prefix;
Identifiers order_by_fields;
Identifiers nuc_insertions_fields;
Identifiers aa_insertions_fields;

public:
Preprocessor(
config::PreprocessingConfig preprocessing_config,
config::DatabaseConfig database_config,
const ReferenceGenomes& reference_genomes,
ReferenceGenomes reference_genomes,
PangoLineageAliasLookup alias_lookup
);

Database preprocess();

private:
template <typename SymbolType>
Identifiers getSequenceIdentifiers();

void finalizeConfig();
void validateConfig();

Expand All @@ -59,27 +65,25 @@ class Preprocessor {
void buildPartitioningTableByColumn(const std::string& partition_by_field);
void buildEmptyPartitioning();

void createInsertionsTableFromFile(
const std::vector<std::string>& expected_sequences,
const std::filesystem::path& insertion_file,
const std::string& table_name
);
template <typename SymbolType>
Identifiers getInsertionsFields();

template <typename SymbolType>
void createInsertionsTableFromFile(const std::filesystem::path& insertion_file);

void createPartitionedSequenceTablesFromNdjson(const ValidatedNdjsonFile& input_file);

void createAlignedPartitionedSequenceViews(const ValidatedNdjsonFile& input_file);
void createUnalignedPartitionedSequenceFiles(const ValidatedNdjsonFile& input_file);
void createUnalignedPartitionedSequenceFile(
const std::string& seq_name,
const std::string& table_sql
);
void createUnalignedPartitionedSequenceFile(size_t sequence_idx, const std::string& table_sql);

void createPartitionedSequenceTablesFromSequenceFiles();

template <typename SymbolType>
void createPartitionedTableForSequence(
const std::string& sequence_name,
const std::string& reference_sequence,
size_t sequence_idx,
const Identifier& prefixed_sequence_identifier,
const std::string& compression_dictionary,
const std::filesystem::path& filename
);

Expand Down
20 changes: 0 additions & 20 deletions include/silo/preprocessing/sequence_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,6 @@ class PreprocessingDatabase;

class SequenceInfo {
public:
[[nodiscard]] static std::vector<std::string> getAlignedSequenceSelects(
const silo::ReferenceGenomes& reference_genomes,
const PreprocessingDatabase& preprocessing_db
);

[[nodiscard]] static std::string getNucleotideSequenceSelect(
std::string_view seq_name,
const PreprocessingDatabase& preprocessing_db
);

[[nodiscard]] static std::string getUnalignedSequenceSelect(
std::string_view seq_name,
const PreprocessingDatabase& preprocessing_db
);

[[nodiscard]] static std::string getAminoAcidSequenceSelect(
std::string_view seq_name,
const PreprocessingDatabase& preprocessing_db
);

static void validateNdjsonFile(
const silo::ReferenceGenomes& reference_genomes,
const std::filesystem::path& input_filename
Expand Down
11 changes: 4 additions & 7 deletions include/silo/preprocessing/sql_function.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <oneapi/tbb/enumerable_thread_specific.h>
#include <duckdb.hpp>

#include "silo/preprocessing/identifier.h"
#include "silo/storage/pango_lineage_alias.h"
#include "silo/zstd/zstd_compressor.h"

Expand All @@ -17,24 +18,20 @@ class ZstdCompressor;

// Abstract base for custom SQL functions that can be added to a
// duckdb::Connection. Subclasses must implement addToConnection.
// NOTE(review): this text is a rendered diff without +/- markers. The
// `std::string function_name` lines appear to be the pre-commit declarations
// and the `preprocessing::Identifier` lines their replacements; the real
// header holds only one of each pair.
class CustomSqlFunction {
public:
explicit CustomSqlFunction(std::string function_name);
explicit CustomSqlFunction(preprocessing::Identifier function_name_);

// Pure virtual: each concrete function adds itself to the given connection.
virtual void addToConnection(duckdb::Connection& connection) = 0;

protected:
// Name of the function; protected, so derived classes can access it.
std::string function_name;
preprocessing::Identifier function_name;
};

class CompressSequence : public CustomSqlFunction {
std::shared_ptr<silo::ZstdCDictionary> zstd_dictionary;
tbb::enumerable_thread_specific<silo::ZstdCompressor> compressor;

public:
CompressSequence(
std::string_view symbol_type_name,
std::string_view sequence_name,
std::string_view reference
);
CompressSequence(preprocessing::Identifier function_name, std::string_view reference);

void addToConnection(duckdb::Connection& connection) override;

Expand Down
18 changes: 10 additions & 8 deletions include/silo/storage/reference_genomes.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,29 @@
namespace silo {

struct ReferenceGenomes {
std::map<std::string, std::vector<Nucleotide::Symbol>> nucleotide_sequences;
std::map<std::string, std::vector<AminoAcid::Symbol>> aa_sequences;
std::map<std::string, std::string> raw_nucleotide_sequences;
std::map<std::string, std::string> raw_aa_sequences;
std::vector<std::string> nucleotide_sequence_names;
std::vector<std::string> aa_sequence_names;
std::vector<std::vector<Nucleotide::Symbol>> nucleotide_sequences;
std::vector<std::vector<AminoAcid::Symbol>> aa_sequences;
std::vector<std::string> raw_nucleotide_sequences;
std::vector<std::string> raw_aa_sequences;

ReferenceGenomes() = default;

explicit ReferenceGenomes(
std::map<std::string, std::string>&& raw_nucleotide_sequences_,
std::map<std::string, std::string>&& raw_aa_sequences_
const std::vector<std::pair<std::string, std::string>>& nucleotide_sequences_,
const std::vector<std::pair<std::string, std::string>>& aa_sequences_
);

void writeToFile(const std::filesystem::path& reference_genomes_path) const;

static ReferenceGenomes readFromFile(const std::filesystem::path& reference_genomes_path);

template <typename SymbolType>
std::vector<std::string> getSequenceNames() const;
const std::vector<std::string>& getSequenceNames() const;

template <typename SymbolType>
std::map<std::string, std::string> getRawSequenceMap() const;
const std::vector<std::string>& getRawSequences() const;

template <typename SymbolType>
static std::vector<typename SymbolType::Symbol> stringToVector(const std::string& string);
Expand Down
2 changes: 1 addition & 1 deletion src/main.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ int main(int argc, char* argv[]) {
SPDLOG_ERROR(e.what());
return 1;
}
spdlog::set_level(spdlog::level::info);
spdlog::set_level(spdlog::level::debug);
spdlog::null_logger_mt(silo::PERFORMANCE_LOGGER_NAME);
::testing::InitGoogleMock(&argc, argv);
return RUN_ALL_TESTS();
Expand Down
Loading

0 comments on commit 901fc7e

Please sign in to comment.