Skip to content

Commit

Permalink
feat: introduce storage of unaligned sequences from either ndjson fil…
Browse files Browse the repository at this point in the history
…e or fasta file and make them queryable via the Fasta action
  • Loading branch information
Taepper committed Jan 24, 2024
1 parent 8fdd64a commit 44df849
Show file tree
Hide file tree
Showing 38 changed files with 1,115 additions and 194 deletions.
4 changes: 1 addition & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,7 @@ Apart from that, there are default values if neither user-provided nor default c
The user-provided preprocessing config can be used to overwrite the default values. For a full reference, see

* [testBaseData/test_preprocessing_config_with_overridden_defaults.yaml](https://github.com/GenSpectrum/LAPIS-SILO/blob/main/testBaseData/test_preprocessing_config_with_overridden_defaults.yaml)
*

or [include/silo/preprocessing/preprocessing_config_reader.h](https://github.com/GenSpectrum/LAPIS-SILO/blob/main/include/silo/preprocessing/preprocessing_config_reader.h)
* or [include/silo/preprocessing/preprocessing_config_reader.h](https://github.com/GenSpectrum/LAPIS-SILO/blob/main/include/silo/preprocessing/preprocessing_config_reader.h)

### Run docker container (api)

Expand Down
44 changes: 44 additions & 0 deletions endToEndTests/test/queries/fasta_manySequences.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"testCaseName": "Get the unaligned fasta for many sequences",
"query": {
"action": {
"type": "Fasta",
"sequenceName": "testSecondSequence"
},
"filterExpression": {
"type": "StringEquals",
"column": "division",
"value": "Vaud"
}
},
"expectedQueryResult": [
{
"gisaid_epi_isl": "EPI_ISL_3259931",
"testSecondSequence": "ACGT"
},
{
"gisaid_epi_isl": "EPI_ISL_3465732",
"testSecondSequence": "ACGT"
},
{
"gisaid_epi_isl": "EPI_ISL_2367431",
"testSecondSequence": "NCGT"
},
{
"gisaid_epi_isl": "EPI_ISL_2359636",
"testSecondSequence": "ACGT"
},
{
"gisaid_epi_isl": "EPI_ISL_1597890",
"testSecondSequence": "ACGT"
},
{
"gisaid_epi_isl": "EPI_ISL_2405276",
"testSecondSequence": "ACGT"
},
{
"gisaid_epi_isl": "EPI_ISL_1001493",
"testSecondSequence": "ACGT"
}
]
}
21 changes: 21 additions & 0 deletions endToEndTests/test/queries/fasta_oneRowTwoUnalignedSequences.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"testCaseName": "Get two unaligned fastas for one row",
"query": {
"action": {
"type": "Fasta",
"sequenceName": ["main", "testSecondSequence"]
},
"filterExpression": {
"type": "StringEquals",
"column": "gisaid_epi_isl",
"value": "EPI_ISL_1749899"
}
},
"expectedQueryResult": [
{
"gisaid_epi_isl": "EPI_ISL_1749899",
"main": "some_very_short_string",
"testSecondSequence": "AAGN"
}
]
}
20 changes: 20 additions & 0 deletions endToEndTests/test/queries/fasta_oneSequenceUnaligned.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
{
"testCaseName": "Get the unaligned fasta for one sequence",
"query": {
"action": {
"type": "Fasta",
"sequenceName": "main"
},
"filterExpression": {
"type": "StringEquals",
"column": "gisaid_epi_isl",
"value": "EPI_ISL_1749899"
}
},
"expectedQueryResult": [
{
"gisaid_epi_isl": "EPI_ISL_1749899",
"main": "some_very_short_string"
}
]
}
4 changes: 4 additions & 0 deletions include/silo/database.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include "silo/storage/database_partition.h"
#include "silo/storage/pango_lineage_alias.h"
#include "silo/storage/sequence_store.h"
#include "silo/storage/unaligned_sequence_store.h"

namespace silo {
class BitmapContainerSize;
Expand All @@ -38,10 +39,13 @@ class Database {
public:
silo::config::DatabaseConfig database_config;
std::vector<DatabasePartition> partitions;
std::filesystem::path intermediate_results_directory;

silo::storage::ColumnGroup columns;

std::map<std::string, SequenceStore<Nucleotide>> nuc_sequences;
std::map<std::string, SequenceStore<AminoAcid>> aa_sequences;
std::map<std::string, UnalignedSequenceStore> unaligned_nuc_sequences;

void validate() const;

Expand Down
13 changes: 13 additions & 0 deletions include/silo/preprocessing/preprocessing_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,11 @@ struct NucleotideSequencePrefix {
};
const NucleotideSequencePrefix DEFAULT_NUCLEOTIDE_SEQUENCE_PREFIX = {"nuc_"};

/// Named wrapper around the prefix string used for unaligned nucleotide
/// sequences, so it cannot be confused with the other prefix arguments taken by
/// the PreprocessingConfig constructor.
struct UnalignedNucleotideSequencePrefix {
  std::string prefix;
};
/// Prefix used when no unaligned-sequence prefix is supplied explicitly.
const UnalignedNucleotideSequencePrefix DEFAULT_UNALIGNED_NUC_SEQUENCE_PREFIX{
  std::string{"unaligned_"}
};

struct GenePrefix {
std::string prefix;
};
Expand All @@ -71,6 +76,7 @@ class PreprocessingConfig {
friend class fmt::formatter<silo::preprocessing::PreprocessingConfig>;

std::filesystem::path input_directory;
std::filesystem::path intermediate_results_directory;
std::filesystem::path output_directory;
std::optional<std::filesystem::path> preprocessing_database_location;
std::optional<std::filesystem::path> pango_lineage_definition_file;
Expand All @@ -79,6 +85,7 @@ class PreprocessingConfig {
std::filesystem::path sequences_folder;
std::filesystem::path reference_genome_file;
std::string nucleotide_sequence_prefix;
std::string unaligned_nucleotide_sequence_prefix;
std::string gene_prefix;

public:
Expand All @@ -94,11 +101,14 @@ class PreprocessingConfig {
const PangoLineageDefinitionFilename& pango_lineage_definition_filename_,
const ReferenceGenomeFilename& reference_genome_filename_,
const NucleotideSequencePrefix& nucleotide_sequence_prefix_,
const UnalignedNucleotideSequencePrefix& unaligned_nucleotide_sequence_prefix_,
const GenePrefix& gene_prefix_
);

[[nodiscard]] std::filesystem::path getOutputDirectory() const;

[[nodiscard]] std::filesystem::path getIntermediateResultsDirectory() const;

[[nodiscard]] std::optional<std::filesystem::path> getPangoLineageDefinitionFilename() const;

[[nodiscard]] std::filesystem::path getReferenceGenomeFilename() const;
Expand All @@ -111,6 +121,9 @@ class PreprocessingConfig {

[[nodiscard]] std::filesystem::path getNucFilenameNoExtension(std::string_view nuc_name) const;

[[nodiscard]] std::filesystem::path getUnalignedNucFilenameNoExtension(std::string_view nuc_name
) const;

[[nodiscard]] std::filesystem::path getGeneFilenameNoExtension(std::string_view gene_name) const;
};

Expand Down
8 changes: 6 additions & 2 deletions include/silo/preprocessing/preprocessing_config_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ struct OptionalPreprocessingConfig {
*/
std::optional<std::filesystem::path> intermediate_results_directory;
/**
* The filename where the intermediate results will be stored
* that are not relevant for an end user.
* The location where the duckdb file for persisting intermediate results will be stored.
* May be ':memory:' to run in in-memory mode without any backing storage.
*/
std::optional<std::filesystem::path> preprocessing_database_location;
/**
Expand All @@ -47,6 +47,10 @@ struct OptionalPreprocessingConfig {
* Prefix that SILO expects for nucleotide sequence files
*/
std::optional<std::string> nucleotide_sequence_prefix;
/**
* Prefix that SILO expects for unaligned nucleotide sequence files
*/
std::optional<std::string> unaligned_nucleotide_sequence_prefix;
/**
* Prefix that SILO expects for gene sequence files
*/
Expand Down
11 changes: 9 additions & 2 deletions include/silo/preprocessing/preprocessing_database.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
#include <memory>
#include <string>

#include "duckdb.hpp"
#include <duckdb.hpp>

namespace silo {

class ZstdFastaTable;
class ReferenceGenomes;

namespace preprocessing {
Expand All @@ -33,7 +34,13 @@ class PreprocessingDatabase {

std::unique_ptr<duckdb::MaterializedQueryResult> query(std::string sql_query);

void generateSequenceTable(
ZstdFastaTable generateSequenceTableFromFasta(
const std::string& table_name,
const std::string& reference_sequence,
const std::string& filename
);

ZstdFastaTable generateSequenceTableFromZstdFasta(
const std::string& table_name,
const std::string& reference_sequence,
const std::string& filename
Expand Down
3 changes: 2 additions & 1 deletion include/silo/preprocessing/preprocessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ class Preprocessor {
const preprocessing::Partitions& partition_descriptor,
const ReferenceGenomes& reference_genomes,
const std::string& order_by_clause,
const silo::PangoLineageAliasLookup& alias_key
const silo::PangoLineageAliasLookup& alias_key,
const std::filesystem::path& intermediate_results_directory
);
};
} // namespace preprocessing
Expand Down
15 changes: 14 additions & 1 deletion include/silo/query_engine/actions/fasta.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include <atomic>
#include <memory>
#include <vector>

Expand All @@ -13,18 +14,30 @@ namespace query_engine {
struct OperatorResult;
} // namespace query_engine
struct Database;
struct DatabasePartition;
} // namespace silo

namespace silo::query_engine::actions {

/// Query action that returns sequences by name for the rows selected by a filter.
///
/// The end-to-end tests address it with `"type": "Fasta"` and a `sequenceName`
/// that is either a single string or a list of strings.
class Fasta : public Action {
  // NOTE(review): presumably the maximum number of sequences a single query may
  // return — confirm against the implementation file.
  static constexpr size_t SEQUENCE_LIMIT = 10'000;

  /// Names of the sequences this action was constructed with.
  std::vector<std::string> sequence_names;

  // No [[nodiscard]] here: the function returns void, so the attribute has no
  // effect and only produces compiler warnings.
  void validateOrderByFields(const Database& database) const override;

  /// Runs the action against `database`, given the filter results in
  /// `bitmap_filter`.
  /// NOTE(review): presumably one OperatorResult per database partition — confirm.
  QueryResult execute(const Database& database, std::vector<OperatorResult> bitmap_filter)
    const override;

  /// Appends the sequences of one partition to `results`.
  ///
  /// @param results             result set that is extended in place
  /// @param database_partition  partition whose sequences are read
  /// @param bitmap              filter result for this partition
  /// @param primary_key_column  name of the primary key column
  ///                            (NOTE(review): presumably emitted alongside each
  ///                            sequence, as in the expected test results — confirm)
  void addSequencesToResultsForPartition(
    QueryResult& results,
    const silo::DatabasePartition& database_partition,
    const OperatorResult& bitmap,
    const std::string& primary_key_column
  ) const;

public:
  explicit Fasta();
  /// Constructs the action for the given sequence names (taken by rvalue reference).
  explicit Fasta(std::vector<std::string>&& sequence_names);
};

// NOLINTNEXTLINE(readability-identifier-naming)
Expand Down
9 changes: 6 additions & 3 deletions include/silo/storage/database_partition.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,13 @@ class InsertionColumnPartition;
} // namespace storage
template <typename SymbolType>
class SequenceStorePartition;
class UnalignedSequenceStorePartition;
} // namespace silo

namespace silo {

class DatabasePartition {
friend class boost::serialization::
access; // here because serialize is private member
// (https://www.boost.org/doc/libs/1_34_0/libs/serialization/doc/serialization.html)
friend class boost::serialization::access;

template <class Archive>
void serialize(Archive& archive, [[maybe_unused]] const uint32_t version) {
Expand All @@ -60,6 +59,9 @@ class DatabasePartition {
for(auto& [name, store] : aa_sequences){
archive & store;
}
for(auto& [name, store] : unaligned_nuc_sequences){
archive & store;
}
archive & sequence_count;
// clang-format on
}
Expand All @@ -78,6 +80,7 @@ class DatabasePartition {
public:
storage::ColumnPartitionGroup columns;
std::map<std::string, SequenceStorePartition<Nucleotide>&> nuc_sequences;
std::map<std::string, UnalignedSequenceStorePartition&> unaligned_nuc_sequences;
std::map<std::string, SequenceStorePartition<AminoAcid>&> aa_sequences;
uint32_t sequence_count = 0;

Expand Down
6 changes: 6 additions & 0 deletions include/silo/storage/reference_genomes.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,12 @@ struct ReferenceGenomes {
void writeToFile(const std::filesystem::path& reference_genomes_path) const;

static ReferenceGenomes readFromFile(const std::filesystem::path& reference_genomes_path);

template <typename SymbolType>
static std::vector<typename SymbolType::Symbol> stringToVector(const std::string& string);

template <typename SymbolType>
static std::string vectorToString(const std::vector<typename SymbolType::Symbol>& vector);
};

} // namespace silo
52 changes: 52 additions & 0 deletions include/silo/storage/unaligned_sequence_store.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
#pragma once

#include <cstdint>
#include <deque>
#include <filesystem>
#include <string>

// Forward declaration of boost::serialization::access so the classes below can
// name it in a friend declaration; this header includes no Boost headers itself.
namespace boost::serialization {
class access;
} // namespace boost::serialization

namespace silo {
class ZstdFastaTableReader;

/// One partition of an unaligned sequence store.
///
/// Holds a file path, a reference to a compression dictionary string that this
/// class does not own, and a sequence counter.
class UnalignedSequenceStorePartition {
// Lets Boost.Serialization call the private serialize() below.
friend class boost::serialization::access;

// Only sequence_count is archived; file_name and compression_dictionary are
// not written to or read from the archive.
template <class Archive>
void serialize(Archive& archive, [[maybe_unused]] const uint32_t version) {
archive & sequence_count;
}

public:
/// @param file_name               path associated with this partition
/// @param compression_dictionary  dictionary string, taken by non-const reference
///                                (NOTE(review): presumably bound to the reference
///                                member below, so it must outlive this object — confirm)
explicit UnalignedSequenceStorePartition(
std::filesystem::path file_name,
std::string& compression_dictionary
);

std::filesystem::path file_name;
// Reference member: the string lives elsewhere, and this makes the class
// non-assignable.
std::string& compression_dictionary;
uint32_t sequence_count = 0;

/// Fills this partition from `input`.
/// NOTE(review): the return value is presumably the number of sequences read —
/// confirm against the definition.
size_t fill(silo::ZstdFastaTableReader& input);
};

/// Container for the partitions of one unaligned sequence, together with the
/// folder path and compression dictionary string it was constructed with.
class UnalignedSequenceStore {
public:
// NOTE(review): a std::deque presumably so that references to existing
// partitions (DatabasePartition stores UnalignedSequenceStorePartition&) stay
// valid when further partitions are appended — confirm.
std::deque<UnalignedSequenceStorePartition> partitions;
std::filesystem::path folder_path;
// Owned dictionary string.
// NOTE(review): presumably the string the partitions' reference members point to — confirm.
std::string compression_dictionary;

/// NOTE(review): presumably persists the store's folder to `save_location` —
/// confirm against the definition.
void saveFolder(const std::filesystem::path& save_location) const;

/// @param folder_path             folder associated with this store
/// @param compression_dictionary  dictionary string, taken by rvalue reference
explicit UnalignedSequenceStore(
std::filesystem::path folder_path,
std::string&& compression_dictionary
);

/// Returns a reference to a partition.
/// NOTE(review): presumably a newly appended element of `partitions` — confirm.
UnalignedSequenceStorePartition& createPartition();
};

} // namespace silo
4 changes: 2 additions & 2 deletions include/silo/zstdfasta/zstd_decompressor.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ class ZstdDecompressor {

explicit ZstdDecompressor(std::string_view dictionary_string);

void decompress(const std::string& input, std::string& output);
size_t decompress(const std::string& input, std::string& output);

void decompress(
size_t decompress(
const char* input_data,
size_t input_length,
char* output_data,
Expand Down
3 changes: 2 additions & 1 deletion include/silo/zstdfasta/zstdfasta_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
#include <string>
#include <string_view>

#include "silo/zstdfasta/zstd_decompressor.h"

namespace silo {
struct ZstdDecompressor;

class ZstdFastaReader {
private:
Expand Down
Loading

0 comments on commit 44df849

Please sign in to comment.