diff --git a/include/silo/common/fasta_reader.h b/include/silo/common/fasta_reader.h deleted file mode 100644 index cb626c8c0..000000000 --- a/include/silo/common/fasta_reader.h +++ /dev/null @@ -1,25 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "silo/common/input_stream_wrapper.h" - -namespace silo { -class FastaReader { - private: - silo::InputStreamWrapper in_file; - - std::optional nextKey(); - - public: - explicit FastaReader(const std::filesystem::path& in_file_name); - - std::optional nextSkipGenome(); - - std::optional next(std::string& genome_buffer); - - void reset(); -}; -} // namespace silo diff --git a/include/silo/common/input_stream_wrapper.h b/include/silo/common/input_stream_wrapper.h index 94d610e38..95b0dcab4 100644 --- a/include/silo/common/input_stream_wrapper.h +++ b/include/silo/common/input_stream_wrapper.h @@ -3,17 +3,20 @@ #include #include #include +#include #include namespace silo { class InputStreamWrapper { private: - std::ifstream file; + std::ifstream file_stream; + std::istringstream string_stream; std::unique_ptr input_stream; public: explicit InputStreamWrapper(const std::filesystem::path& filename); + explicit InputStreamWrapper(const std::string& content); [[nodiscard]] std::istream& getInputStream() const; }; diff --git a/include/silo/preprocessing/preprocessing_database.h b/include/silo/preprocessing/preprocessing_database.h index 3ca8677e1..c96c3d9b9 100644 --- a/include/silo/preprocessing/preprocessing_database.h +++ b/include/silo/preprocessing/preprocessing_database.h @@ -9,7 +9,7 @@ namespace silo { -class ZstdFastaTable; +class ZstdTable; class ReferenceGenomes; class CompressSequence; @@ -42,16 +42,22 @@ class PreprocessingDatabase { std::unique_ptr query(std::string sql_query); - ZstdFastaTable generateSequenceTableFromFasta( + ZstdTable generateSequenceTableViaFile( const std::string& table_name, const std::string& reference_sequence, - const std::string& filename + const std::filesystem::path& file_path ); - ZstdFastaTable generateSequenceTableFromZstdFasta( + ZstdTable generateSequenceTableFromFasta( const std::string& table_name, const std::string& reference_sequence, - const std::string& filename + const std::filesystem::path& file_name + ); + + ZstdTable generateSequenceTableFromSAM( + const std::string& table_name, + const std::string& reference_sequence, + const std::filesystem::path& file_name ); }; diff --git a/include/silo/preprocessing/preprocessor.h b/include/silo/preprocessing/preprocessor.h index 446bb6666..be36730fd 100644 --- a/include/silo/preprocessing/preprocessor.h +++ b/include/silo/preprocessing/preprocessor.h @@ -2,11 +2,14 @@ #include +#include "silo/common/table_reader.h" #include "silo/config/database_config.h" #include "silo/config/preprocessing_config.h" #include "silo/preprocessing/preprocessing_database.h" #include "silo/storage/pango_lineage_alias.h" #include "silo/storage/reference_genomes.h" +#include "silo/storage/sequence_store.h" +#include "silo/zstd/zstd_decompressor.h" namespace silo { class Database; @@ -91,6 +94,18 @@ class Preprocessor { const std::string& order_by_clause ); + template + ColumnFunction createRawReadLambda( + ZstdDecompressor& decompressor, + silo::SequenceStorePartition& sequence_store + ); + + template + ColumnFunction createInsertionLambda( + const std::string& sequence_name, + silo::SequenceStorePartition& sequence_store + ); + template void buildSequenceStore( Database& database, diff --git a/include/silo/preprocessing/sql_function.h b/include/silo/preprocessing/sql_function.h index 5c657dc82..a7fd5b6df 100644 --- a/include/silo/preprocessing/sql_function.h +++ b/include/silo/preprocessing/sql_function.h @@ -9,7 +9,7 @@ #include #include "silo/storage/pango_lineage_alias.h" -#include "silo/zstdfasta/zstd_compressor.h" +#include "silo/zstd/zstd_compressor.h" namespace silo { diff --git a/include/silo/common/fasta_format_exception.h b/include/silo/sequence_file_reader/fasta_format_exception.h similarity index 57% rename from include/silo/common/fasta_format_exception.h rename to include/silo/sequence_file_reader/fasta_format_exception.h index c7b7afb80..980792f3c 100644 --- a/include/silo/common/fasta_format_exception.h +++ b/include/silo/sequence_file_reader/fasta_format_exception.h @@ -3,11 +3,12 @@ #include #include -namespace silo { +namespace silo::sequence_file_reader { class FastaFormatException : public std::runtime_error { public: - explicit FastaFormatException(const std::string& error_message); + explicit FastaFormatException(const std::string& error_message) + : std::runtime_error(error_message.c_str()){}; }; -} // namespace silo +} // namespace silo::sequence_file_reader diff --git a/include/silo/sequence_file_reader/fasta_reader.h b/include/silo/sequence_file_reader/fasta_reader.h new file mode 100644 index 000000000..c718dd00f --- /dev/null +++ b/include/silo/sequence_file_reader/fasta_reader.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include +#include + +#include "silo/sequence_file_reader/sequence_file_reader.h" + +namespace silo::sequence_file_reader { + +class FastaReader : public SequenceFileReader { + public: + explicit FastaReader(const std::filesystem::path& in_file_name) + : SequenceFileReader(in_file_name) {} + std::optional nextEntry() override; +}; +} // namespace silo::sequence_file_reader \ No newline at end of file diff --git a/include/silo/sequence_file_reader/sam_format_exception.h b/include/silo/sequence_file_reader/sam_format_exception.h new file mode 100644 index 000000000..55c98ff6a --- /dev/null +++ b/include/silo/sequence_file_reader/sam_format_exception.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include + +namespace silo::sequence_file_reader { + +class SamFormatException : public std::runtime_error { + public: + explicit SamFormatException(const std::string& error_message) + : std::runtime_error(error_message.c_str()){}; +}; + +} // namespace silo::sequence_file_reader diff --git a/include/silo/sequence_file_reader/sam_reader.h b/include/silo/sequence_file_reader/sam_reader.h new file mode 100644 index 000000000..05fafafa2 --- /dev/null +++ b/include/silo/sequence_file_reader/sam_reader.h @@ -0,0 +1,19 @@ +#pragma once + +#include +#include +#include + +#include "silo/sequence_file_reader/sequence_file_reader.h" + +namespace silo::sequence_file_reader { + +class SamReader : public SequenceFileReader { + public: + explicit SamReader(const std::filesystem::path& in_file_name) + : SequenceFileReader(in_file_name) {} + explicit SamReader(const std::string& file_content) + : SequenceFileReader(file_content) {} + std::optional nextEntry() override; +}; +} // namespace silo::sequence_file_reader diff --git a/include/silo/sequence_file_reader/sequence_file_reader.h b/include/silo/sequence_file_reader/sequence_file_reader.h new file mode 100644 index 000000000..908c64de6 --- /dev/null +++ b/include/silo/sequence_file_reader/sequence_file_reader.h @@ -0,0 +1,29 @@ +#pragma once + +#include +#include + +#include "silo/common/input_stream_wrapper.h" + +namespace silo::sequence_file_reader { +class SequenceFileReader { + protected: + explicit SequenceFileReader(const std::filesystem::path& in_file_name) + : in_file(in_file_name){}; + explicit SequenceFileReader(const std::string& file_content) + : in_file(file_content){}; + + silo::InputStreamWrapper in_file; + + public: + struct ReadSequence { + std::string key; + uint32_t offset; + std::string sequence; + }; + + virtual std::optional nextEntry() = 0; + + virtual ~SequenceFileReader(){}; +}; +} // namespace silo::sequence_file_reader diff --git a/include/silo/storage/sequence_store.h b/include/silo/storage/sequence_store.h index bb091fbbf..075d01de0 100644 --- a/include/silo/storage/sequence_store.h +++ b/include/silo/storage/sequence_store.h @@ -9,12 +9,14 @@ #include #include +#include #include #include "silo/common/aa_symbols.h" #include "silo/common/format_number.h" #include "silo/common/nucleotide_symbols.h" #include "silo/common/symbol_map.h" +#include "silo/common/table_reader.h" #include "silo/storage/insertion_index.h" #include "silo/storage/position.h" @@ -25,7 +27,7 @@ class access; namespace silo { template class Position; -class ZstdFastaTableReader; +class ZstdTableReader; struct SequenceStoreInfo { uint32_t sequence_count; @@ -33,6 +35,19 @@ struct SequenceStoreInfo { size_t n_bitmaps_size; }; +struct ReadSequence { + bool is_valid = false; + std::string sequence = ""; + uint32_t offset; + + ReadSequence(std::string_view _sequence, uint32_t _offset = 0) + : sequence(std::move(_sequence)), + offset(_offset), + is_valid(true) {} + + ReadSequence() {} +}; + template class SequenceStorePartition { friend class boost::serialization::access; @@ -60,7 +75,10 @@ class SequenceStorePartition { uint32_t sequence_count = 0; private: - void fillIndexes(const std::vector>& genomes); + static constexpr size_t BUFFER_SIZE = 1024; + std::vector lazy_buffer; + + void fillIndexes(const std::vector& reads); void addSymbolsToPositions( size_t position_idx, @@ -68,10 +86,12 @@ class SequenceStorePartition { size_t number_of_sequences ); - void fillNBitmaps(const std::vector>& genomes); + void fillNBitmaps(const std::vector& reads); void optimizeBitmaps(); + void flushBuffer(const std::vector& reads); + public: explicit SequenceStorePartition( const std::vector& reference_sequence @@ -86,12 +106,10 @@ class SequenceStorePartition { [[nodiscard]] SequenceStoreInfo getInfo() const; - size_t fill(silo::ZstdFastaTableReader& input); + ReadSequence& appendNewSequenceRead(); void insertInsertion(size_t row_id, const std::string& insertion_and_position); - void interpret(const std::vector>& genomes); - void finalize(); }; diff --git a/include/silo/storage/unaligned_sequence_store.h b/include/silo/storage/unaligned_sequence_store.h index 8aa1e568d..4b724032d 100644 --- a/include/silo/storage/unaligned_sequence_store.h +++ b/include/silo/storage/unaligned_sequence_store.h @@ -10,7 +10,7 @@ class access; } // namespace boost::serialization namespace silo { -class ZstdFastaTableReader; +class ZstdTableReader; /// Holds information where to read unaligned sequences for a /// segment (= the sequence of a particular name) in one partition. diff --git a/include/silo/zstdfasta/zstd_compressor.h b/include/silo/zstd/zstd_compressor.h similarity index 85% rename from include/silo/zstdfasta/zstd_compressor.h rename to include/silo/zstd/zstd_compressor.h index 00ab1972c..edebb3f47 100644 --- a/include/silo/zstdfasta/zstd_compressor.h +++ b/include/silo/zstd/zstd_compressor.h @@ -7,8 +7,8 @@ #include -#include "silo/zstdfasta/zstd_context.h" -#include "silo/zstdfasta/zstd_dictionary.h" +#include "silo/zstd/zstd_context.h" +#include "silo/zstd/zstd_dictionary.h" namespace silo { diff --git a/include/silo/zstdfasta/zstd_context.h b/include/silo/zstd/zstd_context.h similarity index 100% rename from include/silo/zstdfasta/zstd_context.h rename to include/silo/zstd/zstd_context.h diff --git a/include/silo/zstdfasta/zstd_decompressor.h b/include/silo/zstd/zstd_decompressor.h similarity index 68% rename from include/silo/zstdfasta/zstd_decompressor.h rename to include/silo/zstd/zstd_decompressor.h index b85d4d1a4..b7b7982e2 100644 --- a/include/silo/zstdfasta/zstd_decompressor.h +++ b/include/silo/zstd/zstd_decompressor.h @@ -13,14 +13,13 @@ namespace silo { class ZstdDecompressor { ZstdDDictionary zstd_dictionary; ZstdDContext zstd_context; - std::string buffer; public: explicit ZstdDecompressor(std::string_view dictionary_string); - std::string_view decompress(const std::string& input); + void decompress(const std::string& input, std::string& buffer); - std::string_view decompress(const char* input_data, size_t input_length); + void decompress(const char* input_data, size_t input_length, std::string& buffer); }; } // namespace silo diff --git a/include/silo/zstdfasta/zstd_dictionary.h b/include/silo/zstd/zstd_dictionary.h similarity index 100% rename from include/silo/zstdfasta/zstd_dictionary.h rename to include/silo/zstd/zstd_dictionary.h diff --git a/include/silo/zstd/zstd_table.h b/include/silo/zstd/zstd_table.h new file mode 100644 index 000000000..9cd54e1a9 --- /dev/null +++ b/include/silo/zstd/zstd_table.h @@ -0,0 +1,30 @@ +#pragma once + +#include + +#include "silo/sequence_file_reader/sequence_file_reader.h" + +namespace duckdb { +struct Connection; +} + +namespace silo { + +class ZstdTable { + duckdb::Connection& connection; + std::string table_name; + + ZstdTable(duckdb::Connection& connection, std::string table_name) + : connection(connection), + table_name(std::move(table_name)){}; + + public: + static ZstdTable generate( + duckdb::Connection& connection, + const std::string& table_name, + sequence_file_reader::SequenceFileReader& file_reader, + std::string_view reference_sequence + ); +}; + +} // namespace silo diff --git a/include/silo/zstdfasta/zstdfasta_table_reader.h b/include/silo/zstd/zstd_table_reader.h similarity index 90% rename from include/silo/zstdfasta/zstdfasta_table_reader.h rename to include/silo/zstd/zstd_table_reader.h index 85e651bc8..c9d0f3f22 100644 --- a/include/silo/zstdfasta/zstdfasta_table_reader.h +++ b/include/silo/zstd/zstd_table_reader.h @@ -7,7 +7,7 @@ #include #include -#include "silo/zstdfasta/zstd_decompressor.h" +#include "silo/zstd/zstd_decompressor.h" namespace duckdb { struct Connection; @@ -18,7 +18,7 @@ struct DataChunk; namespace silo { struct ZstdDecompressor; -class ZstdFastaTableReader { +class ZstdTableReader { private: duckdb::Connection& connection; std::string table_name; @@ -37,7 +37,7 @@ class ZstdFastaTableReader { void advanceRow(); public: - explicit ZstdFastaTableReader( + explicit ZstdTableReader( duckdb::Connection& connection, std::string_view table_name, std::string_view compression_dict, @@ -54,4 +54,4 @@ class ZstdFastaTableReader { void loadTable(); }; -} // namespace silo +} // namespace silo \ No newline at end of file diff --git a/include/silo/zstdfasta/zstdfasta_reader.h b/include/silo/zstdfasta/zstdfasta_reader.h deleted file mode 100644 index 4c29d4fdf..000000000 --- a/include/silo/zstdfasta/zstdfasta_reader.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "silo/zstdfasta/zstd_decompressor.h" - -namespace silo { - -class ZstdFastaReader { - private: - std::ifstream in_file; - std::unique_ptr decompressor; - - std::optional nextKey(); - - public: - explicit ZstdFastaReader( - const std::filesystem::path& in_file_name, - std::string_view compression_dict - ); - - std::optional nextSkipGenome(); - - std::optional next(std::string& genome); - - std::optional nextCompressed(std::string& compressed_genome); - - void reset(); -}; -} // namespace silo diff --git a/include/silo/zstdfasta/zstdfasta_table.h b/include/silo/zstdfasta/zstdfasta_table.h deleted file mode 100644 index 4df546cfb..000000000 --- a/include/silo/zstdfasta/zstdfasta_table.h +++ /dev/null @@ -1,43 +0,0 @@ -#pragma once - -#include - -namespace duckdb { -struct Connection; -} - -namespace silo { -class ZstdFastaReader; -class ZstdFastaTableReader; -class FastaReader; - -class ZstdFastaTable { - duckdb::Connection& connection; - std::string table_name; - std::string_view compression_dict; - - ZstdFastaTable( - duckdb::Connection& connection, - std::string table_name, - std::string_view compression_dict - ); - - public: - ZstdFastaTableReader getReader(std::string_view where_clause, std::string_view order_by_clause); - - static ZstdFastaTable generate( - duckdb::Connection& connection, - const std::string& table_name, - ZstdFastaReader& file_reader, - std::string_view reference_sequence - ); - - static ZstdFastaTable generate( - duckdb::Connection& connection, - const std::string& table_name, - FastaReader& file_reader, - std::string_view reference_sequence - ); -}; - -} // namespace silo diff --git a/include/silo/zstdfasta/zstdfasta_writer.h b/include/silo/zstdfasta/zstdfasta_writer.h deleted file mode 100644 index 4776feb29..000000000 --- a/include/silo/zstdfasta/zstdfasta_writer.h +++ /dev/null @@ -1,42 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "zstd_compressor.h" - -namespace silo { -class ZstdFastaWriter { - private: - std::ofstream outStream; - std::unique_ptr compressor; - std::optional default_sequence; - - public: - explicit ZstdFastaWriter( - const std::filesystem::path& out_file_name, - std::string_view compression_dict - ); - - explicit ZstdFastaWriter( - const std::filesystem::path& out_file_name, - std::string_view compression_dict, - const std::string& default_sequence_ - ); - - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - void write(const std::string& key, const std::string& genome); - - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - void writeRaw(const std::string& key, const std::string& compressed_genome); - - // NOLINTNEXTLINE(bugprone-easily-swappable-parameters) - void writeRaw(const std::string& key, std::string_view compressed_genome); - - void writeDefault(const std::string& key); -}; -} // namespace silo diff --git a/src/silo/common/aa_symbols.cpp b/src/silo/common/aa_symbols.cpp index 30dc0a423..c2c9b1a93 100644 --- a/src/silo/common/aa_symbols.cpp +++ b/src/silo/common/aa_symbols.cpp @@ -2,7 +2,7 @@ #include -using silo::AminoAcid; +namespace silo { char AminoAcid::symbolToChar(AminoAcid::Symbol symbol) { switch (symbol) { @@ -170,3 +170,4 @@ const silo::SymbolMap> AminoAcid::AMBI {AminoAcid::Symbol::STOP, AminoAcid::Symbol::X}, {AminoAcid::Symbol::X}, }}}; +} // namespace silo \ No newline at end of file diff --git a/src/silo/common/fasta_format_exception.cpp b/src/silo/common/fasta_format_exception.cpp deleted file mode 100644 index 6f25754ae..000000000 --- a/src/silo/common/fasta_format_exception.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "silo/common/fasta_format_exception.h" - -#include -#include - -namespace silo { - -FastaFormatException::FastaFormatException(const std::string& error_message) - : std::runtime_error(error_message.c_str()) {} - -} // namespace silo \ No newline at end of file diff --git a/src/silo/common/fasta_reader.cpp b/src/silo/common/fasta_reader.cpp deleted file mode 100644 index f7e1e71cc..000000000 --- a/src/silo/common/fasta_reader.cpp +++ /dev/null @@ -1,49 +0,0 @@ -#include "silo/common/fasta_reader.h" - -#include -#include -#include - -#include "silo/common/fasta_format_exception.h" -#include "silo/common/input_stream_wrapper.h" - -silo::FastaReader::FastaReader(const std::filesystem::path& in_file_name) - : in_file(in_file_name) {} - -std::optional silo::FastaReader::nextKey() { - std::string key_with_prefix; - if (!getline(in_file.getInputStream(), key_with_prefix)) { - return std::nullopt; - } - - if (key_with_prefix.empty() || key_with_prefix.at(0) != '>') { - throw FastaFormatException("Fasta key prefix '>' missing for key: " + key_with_prefix); - } - - return key_with_prefix.substr(1); -} - -std::optional silo::FastaReader::nextSkipGenome() { - auto key = nextKey(); - - in_file.getInputStream().ignore(LONG_MAX, '\n'); - return key; -} - -std::optional silo::FastaReader::next(std::string& genome_buffer) { - auto key = nextKey(); - if (!key) { - return std::nullopt; - } - - if (!getline(in_file.getInputStream(), genome_buffer)) { - throw FastaFormatException("Missing genome sequence in line following key: " + *key); - } - - return key; -} - -void silo::FastaReader::reset() { - in_file.getInputStream().clear(); // clear fail and eof bits - in_file.getInputStream().seekg(0, std::ios::beg); // g pointer back to the start -} diff --git a/src/silo/common/fasta_reader.test.cpp b/src/silo/common/fasta_reader.test.cpp deleted file mode 100644 index f2a0506c9..000000000 --- a/src/silo/common/fasta_reader.test.cpp +++ /dev/null @@ -1,88 +0,0 @@ -#include -#include -#include - -#include - -#include "silo/common/fasta_format_exception.h" -#include "silo/common/fasta_reader.h" -#include "silo/config/preprocessing_config.h" - -TEST(FastaReader, shouldReadFastaFile) { - const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; - const std::string sequence_filename{"test.fasta"}; - - std::filesystem::path file_path = input_directory; - file_path += sequence_filename; - - silo::FastaReader under_test(file_path); - - std::optional key; - std::string genome; - key = under_test.next(genome); - EXPECT_TRUE(key != std::nullopt); - EXPECT_EQ(key, "Key1"); - EXPECT_EQ(genome, "ACGT"); - - key = under_test.next(genome); - EXPECT_TRUE(key != std::nullopt); - EXPECT_EQ(key, "Key2"); - EXPECT_EQ(genome, "CGTA"); - - key = under_test.next(genome); - EXPECT_FALSE(key != std::nullopt); -} - -TEST(FastaReader, shouldReadFastaFileWithoutNewLineAtEnd) { - const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; - const std::string sequence_filename{"no_end_new_line.fasta"}; - - std::filesystem::path file_path = input_directory; - file_path += sequence_filename; - - silo::FastaReader under_test(file_path); - - std::optional key; - std::string genome; - - key = under_test.next(genome); - EXPECT_TRUE(key != std::nullopt); - EXPECT_EQ(key, "Key"); - EXPECT_EQ(genome, "ACGT"); - - key = under_test.next(genome); - EXPECT_FALSE(key != std::nullopt); -} - -TEST(FastaReader, givenDataInWrongFormatThenShouldThrowAnException) { - const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; - const std::string sequence_filename{"wrong_format.fasta"}; - - std::filesystem::path file_path = input_directory; - file_path += sequence_filename; - - silo::FastaReader under_test(file_path); - - std::string genome; - EXPECT_THROW(under_test.next(genome), silo::FastaFormatException); -} - -TEST(FastaReader, givenDataInWithMissingGenomeThenShouldThrowAnException) { - const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; - const std::string sequence_filename{"missing_genome.fasta"}; - - std::filesystem::path file_path = input_directory; - file_path += sequence_filename; - - silo::FastaReader under_test(file_path); - - std::optional key; - std::string genome; - - key = under_test.next(genome); - EXPECT_TRUE(key != std::nullopt); - EXPECT_EQ(key, "Key"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_THROW(under_test.next(genome), silo::FastaFormatException); -} \ No newline at end of file diff --git a/src/silo/common/input_stream_wrapper.cpp b/src/silo/common/input_stream_wrapper.cpp index bbc26a7f3..2e053cdb9 100644 --- a/src/silo/common/input_stream_wrapper.cpp +++ b/src/silo/common/input_stream_wrapper.cpp @@ -27,21 +27,27 @@ InputStreamWrapper::InputStreamWrapper(const std::filesystem::path& filename) : input_stream(std::make_unique()) { if (std::filesystem::is_regular_file(withZSTending(filename))) { SPDLOG_INFO("Detected file-ending .zst for input file " + filename.string()); - file = std::ifstream(withZSTending(filename), std::ios::binary); + file_stream = std::ifstream(withZSTending(filename), std::ios::binary); input_stream->push(boost::iostreams::zstd_decompressor()); } else if (std::filesystem::is_regular_file(withXZending(filename))) { SPDLOG_INFO("Detected file-ending .xz for input file " + filename.string()); - file = std::ifstream(withXZending(filename), std::ios::binary); + file_stream = std::ifstream(withXZending(filename), std::ios::binary); input_stream->push(boost::iostreams::lzma_decompressor()); } else if (std::filesystem::is_regular_file(filename)) { SPDLOG_INFO("Detected file without specialized ending, processing raw: " + filename.string()); - file = std::ifstream(filename, std::ios::binary); + file_stream = std::ifstream(filename, std::ios::binary); } else { throw silo::preprocessing::PreprocessingException( "Cannot find file with name or associated endings (.xz, .zst): " + filename.string() ); } - input_stream->push(file); + input_stream->push(file_stream); +} + +InputStreamWrapper::InputStreamWrapper(const std::string& content) { + string_stream = std::istringstream(content); + input_stream = std::make_unique(); + input_stream->push(string_stream); } std::istream& silo::InputStreamWrapper::getInputStream() const { diff --git a/src/silo/common/nucleotide_symbols.cpp b/src/silo/common/nucleotide_symbols.cpp index e5a46c9ae..1f7ff14b3 100644 --- a/src/silo/common/nucleotide_symbols.cpp +++ b/src/silo/common/nucleotide_symbols.cpp @@ -154,5 +154,4 @@ const silo::SymbolMap> Nucleotide::A {Nucleotide::Symbol::V}, {Nucleotide::Symbol::N}, }}}; - } // namespace silo \ No newline at end of file diff --git a/src/silo/database.cpp b/src/silo/database.cpp index 1c4bb1b8d..1aae836cc 100644 --- a/src/silo/database.cpp +++ b/src/silo/database.cpp @@ -38,7 +38,6 @@ #include "silo/common/block_timer.h" #include "silo/common/data_version.h" -#include "silo/common/fasta_reader.h" #include "silo/common/format_number.h" #include "silo/common/nucleotide_symbols.h" #include "silo/config/database_config.h" @@ -50,6 +49,7 @@ #include "silo/query_engine/query_engine.h" #include "silo/query_engine/query_result.h" #include "silo/roaring/roaring_serialize.h" +#include "silo/sequence_file_reader/fasta_reader.h" #include "silo/storage/column/date_column.h" #include "silo/storage/column/float_column.h" #include "silo/storage/column/indexed_string_column.h" @@ -63,9 +63,8 @@ #include "silo/storage/sequence_store.h" #include "silo/storage/serialize_optional.h" #include "silo/storage/unaligned_sequence_store.h" -#include "silo/zstdfasta/zstd_decompressor.h" -#include "silo/zstdfasta/zstdfasta_table.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" +#include "silo/zstd/zstd_decompressor.h" +#include "silo/zstd/zstd_table.h" namespace silo { diff --git a/src/silo/file_reader/fasta_reader.cpp b/src/silo/file_reader/fasta_reader.cpp new file mode 100644 index 000000000..1267b45c4 --- /dev/null +++ b/src/silo/file_reader/fasta_reader.cpp @@ -0,0 +1,42 @@ +#include + +#include "silo/common/string_utils.h" +#include "silo/sequence_file_reader/fasta_format_exception.h" +#include "silo/sequence_file_reader/fasta_reader.h" + +namespace silo::sequence_file_reader { + +std::optional FastaReader::nextEntry() { + std::string data; + while (data.empty()) { + if (!getline(in_file.getInputStream(), data)) { + return std::nullopt; + } + } + + if (data.empty() || data.at(0) != '>') { + throw FastaFormatException("Fasta key prefix '>' missing for key: " + data); + } + + auto parts = splitBy(data.substr(1), "|"); + auto key = parts.front(); + auto fields = std::vector(parts.begin() + 1, parts.end()); + + if (key.empty()) { + throw FastaFormatException("Fasta description not valid, missing id: " + data); + } + + std::string sequence; + + if (!getline(in_file.getInputStream(), sequence)) { + throw FastaFormatException("Missing genome sequence in line following identifier: " + key); + } + + return ReadSequence{ + .key = key, + .offset = fields.empty() ? 0 : static_cast(std::stoi(fields[0])), + .sequence = sequence + }; +} + +} // namespace silo::sequence_file_reader diff --git a/src/silo/file_reader/fasta_reader.test.cpp b/src/silo/file_reader/fasta_reader.test.cpp new file mode 100644 index 000000000..36294acc3 --- /dev/null +++ b/src/silo/file_reader/fasta_reader.test.cpp @@ -0,0 +1,140 @@ +#include +#include +#include + +#include "silo/config/preprocessing_config.h" +#include "silo/sequence_file_reader/fasta_format_exception.h" +#include "silo/sequence_file_reader/fasta_reader.h" + +using silo::sequence_file_reader::FastaFormatException; +using silo::sequence_file_reader::FastaReader; + +TEST(FastaReader, shouldReadFastaFile) { + const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; + const std::string sequence_filename{"test.fasta"}; + + std::filesystem::path file_path = input_directory; + file_path += sequence_filename; + + FastaReader under_test(file_path); + + auto entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + auto [key, _, genome] = entry.value(); + EXPECT_EQ(key, "Key1"); + EXPECT_EQ(genome, "ACGT"); + + entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + key = entry.value().key; + genome = entry.value().sequence; + EXPECT_EQ(key, "Key2"); + EXPECT_EQ(genome, "CGTA"); + + entry = under_test.nextEntry(); + EXPECT_FALSE(entry.has_value()); +} + +TEST(FastaReader, shouldReadFastaFileWithBlankLines) { + const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; + const std::string sequence_filename{"blankLines.fasta"}; + + std::filesystem::path file_path = input_directory; + file_path += sequence_filename; + + FastaReader under_test(file_path); + + auto entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + auto [key, _, genome] = entry.value(); + EXPECT_EQ(key, "Key1"); + EXPECT_EQ(genome, "ACGT"); + + entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + key = entry.value().key; + genome = entry.value().sequence; + EXPECT_EQ(key, "Key2"); + EXPECT_EQ(genome, "CGTA"); + + entry = under_test.nextEntry(); + EXPECT_FALSE(entry.has_value()); +} + +TEST(FastaReader, shouldReadFastaFileWithoutNewLineAtEnd) { + const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; + const std::string sequence_filename{"no_end_new_line.fasta"}; + + std::filesystem::path file_path = input_directory; + file_path += sequence_filename; + + FastaReader under_test(file_path); + + auto entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + auto [key, _, genome] = entry.value(); + EXPECT_EQ(key, "Key"); + EXPECT_EQ(genome, "ACGT"); + + entry = under_test.nextEntry(); + EXPECT_FALSE(entry.has_value()); +} + +TEST(FastaReader, givenDataInWrongFormatThenShouldThrowAnException) { + const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; + const std::string sequence_filename{"wrong_format.fasta"}; + + std::filesystem::path file_path = input_directory; + file_path += sequence_filename; + + FastaReader under_test(file_path); + + EXPECT_THROW(under_test.nextEntry(), FastaFormatException); +} + +TEST(FastaReader, givenDataInWithMissingGenomeThenShouldThrowAnException) { + const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; + const std::string sequence_filename{"missing_genome.fasta"}; + + std::filesystem::path file_path = input_directory; + file_path += sequence_filename; + + FastaReader under_test(file_path); + + auto entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + auto [key, _, genome] = entry.value(); + EXPECT_EQ(key, "Key"); + EXPECT_EQ(genome, "ACGT"); + + EXPECT_THROW(under_test.nextEntry(), FastaFormatException); +} + +TEST(FastaReader, shouldReadFastaFileWithOffset) { + const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; + const std::string sequence_filename{"withOffset.fasta"}; + + std::filesystem::path file_path = input_directory; + file_path += sequence_filename; + + FastaReader under_test(file_path); + + auto entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + auto [key, offset, genome] = entry.value(); + EXPECT_EQ(key, "Key1"); + EXPECT_EQ(offset, 10); + EXPECT_EQ(genome, "ACGT"); + + entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + key = entry.value().key; + offset = entry.value().offset; + genome = entry.value().sequence; + EXPECT_EQ(key, "Key2"); + EXPECT_EQ(offset, 20); + EXPECT_EQ(genome, "CGTA"); + + entry = under_test.nextEntry(); + EXPECT_FALSE(entry.has_value()); +} \ No newline at end of file diff --git a/src/silo/file_reader/sam_reader.cpp b/src/silo/file_reader/sam_reader.cpp new file mode 100644 index 000000000..7d6cdcfd1 --- /dev/null +++ b/src/silo/file_reader/sam_reader.cpp @@ -0,0 +1,43 @@ +#include +#include + +#include +#include "silo/sequence_file_reader/sam_format_exception.h" +#include "silo/sequence_file_reader/sam_reader.h" +#include "silo/sequence_file_reader/sequence_file_reader.h" + +namespace silo::sequence_file_reader { + +std::optional SamReader::nextEntry() { + std::string data; + if (!getline(in_file.getInputStream(), data)) { + return std::nullopt; + } + + // optional header data + while ((data.empty() || data.at(0) == '@') && getline(in_file.getInputStream(), data)) { + ; + } + + auto parts = splitBy(data, "\t"); + if (parts.size() < 12) { + throw SamFormatException( + "Sam incorrectly formatted: 11 tab separated fields are required, incorrect row: " + data + ); + } + if (parts.at(0).empty()) { + throw SamFormatException("Sam incorrectly formatted: missing key in row: " + data); + } + if (parts.at(3).empty()) { + throw SamFormatException("Sam incorrectly formatted: missing offset in row: " + data); + } + if (parts.at(9).empty()) { + throw SamFormatException("Sam incorrectly formatted: empty sequence in row: " + data); + } + return SequenceFileReader::ReadSequence{ + .key = parts.at(0), + .offset = static_cast(std::stoi(parts.at(3))), + .sequence = parts.at(9) + }; +} +} // namespace silo::sequence_file_reader \ No newline at end of file diff --git a/src/silo/file_reader/sam_reader.test.cpp b/src/silo/file_reader/sam_reader.test.cpp new file mode 100644 index 000000000..1b4230c56 --- /dev/null +++ b/src/silo/file_reader/sam_reader.test.cpp @@ -0,0 +1,93 @@ +#include +#include +#include + +#include "silo/config/preprocessing_config.h" +#include "silo/sequence_file_reader/sam_format_exception.h" +#include "silo/sequence_file_reader/sam_reader.h" + +using silo::sequence_file_reader::SamFormatException; +using silo::sequence_file_reader::SamReader; + +TEST(SamReader, shouldReadSamFile) { + const std::string file_content{ + R"( +1.1 99 NC_045512.2 10 60 5S246M = 201 404 TATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:1 MD:Z:197C48 MC:Z:247M AS:i:241 XS:i:0 +1.3 99 NC_045512.2 5 60 47S204M = 209 400 TATACCTTCCCAGGTAACAAACCAACCAACTTTTTTTTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:1 MD:Z:196C7 MC:Z:236M AS:i:199 XS:i:0 SA:Z:NC_045512.2,7769,+,30M221S,60,0; +)" + }; + + SamReader under_test(file_content); + + auto entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + auto [key, offset, genome] = entry.value(); + EXPECT_EQ(key, "1.1"); + EXPECT_EQ(offset, 10); + EXPECT_EQ(genome, "TATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCT"); + + entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + key = entry.value().key; + offset = entry.value().offset; + genome = entry.value().sequence; + EXPECT_EQ(key, "1.3"); + EXPECT_EQ(offset, 5); + EXPECT_EQ(genome, "TATACCTTCCCAGGTAACAAACCAACCAACTTTTTTTTT"); + + entry = under_test.nextEntry(); + EXPECT_FALSE(entry.has_value()); +} + +TEST(SamReader, shouldReadSamFileWithBlankLines) { + const std::string file_content{ + R"( + +1.1 99 NC_045512.2 10 60 5S246M = 201 404 TATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:1 MD:Z:197C48 MC:Z:247M AS:i:241 XS:i:0 + + +1.3 99 NC_045512.2 5 60 47S204M = 209 400 TATACCTTCCCAGGTAACAAACCAACCAACTTTTTTTTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:1 MD:Z:196C7 MC:Z:236M AS:i:199 XS:i:0 SA:Z:NC_045512.2,7769,+,30M221S,60,0; +)" + }; + + SamReader under_test(file_content); + + auto entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + auto [key, offset, genome] = entry.value(); + EXPECT_EQ(key, "1.1"); + EXPECT_EQ(offset, 10); + EXPECT_EQ(genome, "TATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCT"); + + entry = under_test.nextEntry(); + EXPECT_TRUE(entry.has_value()); + key = entry.value().key; + offset = entry.value().offset; + genome = entry.value().sequence; + EXPECT_EQ(key, "1.3"); + EXPECT_EQ(offset, 5); + EXPECT_EQ(genome, "TATACCTTCCCAGGTAACAAACCAACCAACTTTTTTTTT"); + + entry = under_test.nextEntry(); + EXPECT_FALSE(entry.has_value()); +} + +TEST(SamReader, givenDataInWrongFormatThenShouldThrowAnException) { + const std::string file_content{"Wrong format"}; + + SamReader under_test(file_content); + + EXPECT_THROW(under_test.nextEntry(), SamFormatException); +} + +TEST(SamReader, givenDataInWithMissingGenomeThenShouldThrowAnException) { + const std::string file_content{ + R"( +1.1 99 NC_045512.2 10 60 5S246M = 201 404 CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:1 MD:Z:197C48 MC:Z:247M AS:i:241 XS:i:0 +)" + }; + + SamReader under_test(file_content); + + EXPECT_THROW(under_test.nextEntry(), SamFormatException); +} diff --git a/src/silo/preprocessing/preprocessing_database.cpp b/src/silo/preprocessing/preprocessing_database.cpp index 4fcc0fd5f..b58592d99 100644 --- a/src/silo/preprocessing/preprocessing_database.cpp +++ b/src/silo/preprocessing/preprocessing_database.cpp @@ -9,21 +9,26 @@ #include #include -#include "silo/common/fasta_reader.h" +#include "silo/common/string_utils.h" #include "silo/preprocessing/partition.h" #include "silo/preprocessing/preprocessing_exception.h" #include "silo/preprocessing/sql_function.h" +#include "silo/sequence_file_reader/fasta_reader.h" +#include "silo/sequence_file_reader/sam_reader.h" #include "silo/storage/reference_genomes.h" -#include "silo/zstdfasta/zstd_compressor.h" -#include "silo/zstdfasta/zstdfasta_reader.h" -#include "silo/zstdfasta/zstdfasta_table.h" -#include "silo/zstdfasta/zstdfasta_writer.h" +#include "silo/zstd/zstd_compressor.h" +#include "silo/zstd/zstd_table.h" using duckdb::BigIntValue; using duckdb::ListValue; using duckdb::MaterializedQueryResult; using duckdb::Value; +namespace { +constexpr std::string_view FASTA_EXTENSION = "fasta"; +constexpr std::string_view SAM_EXTENSION = "sam"; +} // namespace + namespace silo::preprocessing { PreprocessingDatabase::PreprocessingDatabase( @@ -109,22 +114,55 @@ preprocessing::Partitions PreprocessingDatabase::getPartitionDescriptor() { return preprocessing::Partitions(partitions); } -ZstdFastaTable PreprocessingDatabase::generateSequenceTableFromFasta( +ZstdTable PreprocessingDatabase::generateSequenceTableViaFile( const std::string& table_name, const std::string& reference_sequence, - const std::string& filename + const std::filesystem::path& file_path ) { - silo::FastaReader fasta_reader(filename); - return ZstdFastaTable::generate(connection, table_name, fasta_reader, reference_sequence); + const auto file_stem = file_path.stem().string(); + for (const auto& entry : std::filesystem::directory_iterator(file_path.parent_path())) { + const auto entry_file_name = entry.path().filename().string(); + if (!entry.is_regular_file() || !entry_file_name.starts_with(file_stem)) { + continue; + } + auto extensions = splitBy(entry_file_name, "."); + auto last = extensions.back(); + if (last == "zst" || last == "xz") { + extensions.pop_back(); + last = extensions.back(); + } + if (last == FASTA_EXTENSION) { + return generateSequenceTableFromFasta(table_name, reference_sequence, entry.path()); + } + if (last == SAM_EXTENSION) { + return generateSequenceTableFromSAM(table_name, reference_sequence, entry.path()); + } + } + + throw PreprocessingException(fmt::format( + "Could not find reference file for {}, tried file extensions: .fasta(.zst,.xz), " + ".sam(.zst,.xz)", + file_path.string() + )); } -ZstdFastaTable PreprocessingDatabase::generateSequenceTableFromZstdFasta( +ZstdTable PreprocessingDatabase::generateSequenceTableFromFasta( const std::string& table_name, const std::string& reference_sequence, - const std::string& filename + const std::filesystem::path& file_name ) { - silo::ZstdFastaReader zstd_fasta_reader(filename, reference_sequence); - return ZstdFastaTable::generate(connection, table_name, zstd_fasta_reader, reference_sequence); + silo::sequence_file_reader::FastaReader fasta_reader(file_name); + return ZstdTable::generate(connection, table_name, fasta_reader, reference_sequence); +} + +ZstdTable PreprocessingDatabase::generateSequenceTableFromSAM( + const std::string& table_name, + const std::string& reference_sequence, + const std::filesystem::path& file_name +) { + silo::sequence_file_reader::SamReader sam_reader(file_name); + + return ZstdTable::generate(connection, table_name, sam_reader, reference_sequence); } std::vector extractStringListValue( diff --git a/src/silo/preprocessing/preprocessor.cpp b/src/silo/preprocessing/preprocessor.cpp index 532190f22..fed3c7095 100644 --- a/src/silo/preprocessing/preprocessor.cpp +++ b/src/silo/preprocessing/preprocessor.cpp @@ -7,7 +7,6 @@ #include #include "silo/common/block_timer.h" -#include "silo/common/fasta_reader.h" #include "silo/common/string_utils.h" #include "silo/common/table_reader.h" #include "silo/config/preprocessing_config.h" @@ -19,16 +18,15 @@ #include "silo/preprocessing/sequence_info.h" #include "silo/preprocessing/sql_function.h" #include "silo/preprocessing/validated_ndjson_file.h" +#include "silo/sequence_file_reader/fasta_reader.h" #include "silo/storage/reference_genomes.h" -#include "silo/zstdfasta/zstdfasta_table.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" - -namespace silo::preprocessing { - -static constexpr std::string_view FASTA_EXTENSION = ".fasta"; -static const std::string INSERTIONS_TABLE_NAME_SUFFIX = "insertions"; +#include "silo/storage/unaligned_sequence_store.h" +#include "silo/zstd/zstd_decompressor.h" +#include "silo/zstd/zstd_table.h" namespace { +const std::string FASTA_EXTENSION = ".fasta"; +const std::string INSERTIONS_TABLE_NAME_SUFFIX = "insertions"; template std::string getInsertionsTableName() { return fmt::format("{}{}", SymbolType::PREFIX, INSERTIONS_TABLE_NAME_SUFFIX); @@ -36,6 +34,8 @@ std::string getInsertionsTableName() { } // namespace +namespace silo::preprocessing { + Preprocessor::Preprocessor( config::PreprocessingConfig preprocessing_config_, config::DatabaseConfig database_config_, @@ -427,7 +427,7 @@ void Preprocessor::createAlignedPartitionedSequenceViews(const ValidatedNdjsonFi for (const auto& prefixed_nuc_name : prefixed_nuc_sequences) { (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE VIEW {0} AS\n" - "SELECT key, {0} AS sequence, partition_id, {1} " + "SELECT key, struct_pack(\"offset\" := 0, sequence := {0}) AS read, partition_id, {1} " "FROM sequence_table;", prefixed_nuc_name, boost::join(prefixed_order_by_fields, ",") @@ -437,7 +437,7 @@ void Preprocessor::createAlignedPartitionedSequenceViews(const ValidatedNdjsonFi for (const auto& prefixed_aa_name : prefixed_aa_sequences) { (void)preprocessing_db.query(fmt::format( "CREATE OR REPLACE VIEW {0} AS\n" - "SELECT key, {0} AS sequence, partition_id, {1} " + "SELECT key, struct_pack(\"offset\" := 0, sequence := {0}) AS read, partition_id, {1} " "FROM sequence_table;", prefixed_aa_name, boost::join(prefixed_order_by_fields, ",") @@ -446,29 +446,38 @@ void Preprocessor::createAlignedPartitionedSequenceViews(const ValidatedNdjsonFi } void Preprocessor::createUnalignedPartitionedSequenceFiles(const ValidatedNdjsonFile& input_file) { - for (const auto& [seq_name, _] : reference_genomes_.raw_nucleotide_sequences) { + for (const auto& seq_name : nuc_sequences) { + const std::string sequence_column_name = fmt::format("unaligned_nuc_{}", seq_name); const std::string file_reader_sql = - input_file.isEmpty() - ? fmt::format( - "SELECT ''::VARCHAR AS key, 'NULL'::VARCHAR as partition_key," - " ''::VARCHAR AS unaligned_nuc_{} LIMIT 0", - seq_name - ) - : fmt::format( - "SELECT metadata.\"{0}\" AS key, {1} AS partition_key, " - " unalignedNucleotideSequences.\"{2}\" AS unaligned_nuc_{2} " - "FROM read_json_auto('{3}')", - database_config.schema.primary_key, - getPartitionKeySelect(), - seq_name, - input_file.getFileName().string() - ); + input_file.isEmpty() ? fmt::format( + "SELECT ''::VARCHAR AS key, 'NULL'::VARCHAR as partition_key," + " ''::VARCHAR AS {} LIMIT 0", + sequence_column_name + ) + : fmt::format( + "SELECT metadata.\"{0}\" AS key, {1} AS partition_key, " + " unalignedNucleotideSequences.\"{2}\" AS {3} " + "FROM read_json_auto('{4}')", + database_config.schema.primary_key, + getPartitionKeySelect(), + seq_name, + sequence_column_name, + input_file.getFileName().string() + ); const std::string table_sql = fmt::format( - "SELECT key, {}, partition_key_to_partition.partition_id \n" - "FROM ({}) file_reader " - "JOIN partition_key_to_partition " - "ON (file_reader.partition_key = partition_key_to_partition.partition_key) ", - SequenceInfo::getUnalignedSequenceSelect(seq_name, preprocessing_db), + R"( + SELECT + key, + struct_pack("offset" := 0, sequence := {0}) AS {1}, + partition_key_to_partition.partition_id + FROM ({2}) file_reader + JOIN partition_key_to_partition + ON (file_reader.partition_key = partition_key_to_partition.partition_key) + )", + preprocessing_db.compress_nucleotide_functions.at(seq_name)->generateSqlStatement( + sequence_column_name + ), + sequence_column_name, file_reader_sql ); createUnalignedPartitionedSequenceFile(seq_name, table_sql); @@ -557,16 +566,15 @@ void Preprocessor::createPartitionedSequenceTablesFromSequenceFiles() { .replace_extension(FASTA_EXTENSION) ); - preprocessing_db.generateSequenceTableFromFasta( + preprocessing_db.generateSequenceTableViaFile( "unaligned_tmp", reference_sequence, preprocessing_config.getUnalignedNucFilenameNoExtension(sequence_name) - .replace_extension(FASTA_EXTENSION) ); createUnalignedPartitionedSequenceFile( sequence_name, fmt::format( - "SELECT unaligned_tmp.key AS key, unaligned_tmp.sequence AS unaligned_nuc_{}, " + "SELECT unaligned_tmp.key AS key, unaligned_tmp.read AS unaligned_nuc_{}, " "partitioned_metadata.partition_id AS partition_id " "FROM unaligned_tmp RIGHT JOIN partitioned_metadata " "ON unaligned_tmp.key = partitioned_metadata.\"{}\" ", @@ -597,12 +605,12 @@ void Preprocessor::createPartitionedTableForSequence( fmt::format("{}{}{}", "raw_", SymbolType::PREFIX, sequence_name); const std::string table_name = fmt::format("{}{}", SymbolType::PREFIX, sequence_name); - preprocessing_db.generateSequenceTableFromFasta(raw_table_name, reference_sequence, filename); + preprocessing_db.generateSequenceTableViaFile(raw_table_name, reference_sequence, filename); (void)preprocessing_db.query(fmt::format( R"-( CREATE OR REPLACE VIEW {} AS - SELECT key, sequence, + SELECT key, read, partitioned_metadata.partition_id AS partition_id {} FROM {} AS raw RIGHT JOIN partitioned_metadata ON raw.key = partitioned_metadata."{}"; @@ -744,6 +752,33 @@ ColumnFunction createInsertionsLambda( } } // namespace +template +ColumnFunction Preprocessor::createRawReadLambda( + ZstdDecompressor& decompressor, + silo::SequenceStorePartition& sequence_store +) { + return { + "read", + [&decompressor, + &sequence_store](size_t chunk_offset, const duckdb::Vector& vector, size_t chunk_size) { + for (size_t row_in_chunk = 0; row_in_chunk < chunk_size; row_in_chunk++) { + ReadSequence& target = sequence_store.appendNewSequenceRead(); + const auto& value = vector.GetValue(row_in_chunk); + if (value.IsNull()) { + continue; + } + const auto& children = duckdb::StructValue::GetChildren(value); + if (children[1].IsNull()) { + continue; + } + decompressor.decompress(children[1].GetValueUnsafe(), target.sequence); + target.offset = children[0].GetValue(); + target.is_valid = true; + } + } + }; +} + template void Preprocessor::buildSequenceStore( Database& database, @@ -769,22 +804,28 @@ void Preprocessor::buildSequenceStore( partition_index ); - auto& sequence_store = database.partitions.at(partition_index) - .template getSequenceStores() - .at(sequence_name); + SequenceStorePartition& sequence_store = + database.partitions.at(partition_index) + .template getSequenceStores() + .at(sequence_name); + + ZstdDecompressor decompressor(reference_sequence); - silo::ZstdFastaTableReader sequence_input( + auto column_function_reads = createRawReadLambda(decompressor, sequence_store); + + silo::TableReader( preprocessing_db.getConnection(), fmt::format("{}{}", SymbolType::PREFIX, sequence_name), - reference_sequence, - "sequence", + "key", + {column_function_reads}, fmt::format("partition_id = {}", partition_index), order_by_clause - ); - sequence_store.fill(sequence_input); + ) + .read(); const silo::ColumnFunction column_function = createInsertionsLambda(sequence_name, sequence_store); + silo::TableReader( preprocessing_db.getConnection(), getInsertionsTableName(), diff --git a/src/silo/preprocessing/preprocessor.test.cpp b/src/silo/preprocessing/preprocessor.test.cpp index 38f43d008..982068a9b 100644 --- a/src/silo/preprocessing/preprocessor.test.cpp +++ b/src/silo/preprocessing/preprocessor.test.cpp @@ -53,6 +53,35 @@ const Scenario FASTA_FILES_WITH_MISSING_SEGMENTS_AND_GENES = { }])") }; +// First 10 characters of sequence in nuc_main.sam have been removed +// This is to test the offset (set to 10 for both reads) +const Scenario SAM_FILES = { + .input_directory = "testBaseData/samFiles/", + .expected_sequence_count = 2, + .query = R"( + { + "action": { + "type": "FastaAligned", + "sequenceName": ["main"], + "orderByFields": ["accessionVersion"] + }, + "filterExpression": { + "type": "True" + } + } + )", + .expected_query_result = nlohmann::json::parse(R"([ + { + "accessionVersion": "1.1", + "main": "NNNNNNNNNNTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCT" + }, + { + "accessionVersion": "1.3", + "main": "NNNNNTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTNNNNN" + } + ])") +}; + const Scenario NDJSON_FILE_WITH_MISSING_SEGMENTS_AND_GENES = { .input_directory = "testBaseData/ndjsonWithNullSequences/", .expected_sequence_count = FASTA_FILES_WITH_MISSING_SEGMENTS_AND_GENES.expected_sequence_count, @@ -236,6 +265,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( FASTA_FILES_WITH_MISSING_SEGMENTS_AND_GENES, NDJSON_FILE_WITH_MISSING_SEGMENTS_AND_GENES, + SAM_FILES, NDJSON_WITH_SQL_KEYWORD_AS_FIELD, TSV_FILE_WITH_SQL_KEYWORD_AS_FIELD, NDJSON_WITH_NUMERIC_NAMES, diff --git a/src/silo/preprocessing/sql_function.cpp b/src/silo/preprocessing/sql_function.cpp index b558a16bd..bbc665059 100644 --- a/src/silo/preprocessing/sql_function.cpp +++ b/src/silo/preprocessing/sql_function.cpp @@ -4,7 +4,7 @@ #include "silo/common/pango_lineage.h" #include "silo/storage/reference_genomes.h" -#include "silo/zstdfasta/zstd_compressor.h" +#include "silo/zstd/zstd_compressor.h" using duckdb::Connection; using duckdb::DataChunk; diff --git a/src/silo/query_engine/actions/fasta.cpp b/src/silo/query_engine/actions/fasta.cpp index d96645769..20462302c 100644 --- a/src/silo/query_engine/actions/fasta.cpp +++ b/src/silo/query_engine/actions/fasta.cpp @@ -13,7 +13,7 @@ #include "silo/query_engine/operator_result.h" #include "silo/query_engine/query_parse_exception.h" #include "silo/query_engine/query_result.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" +#include "silo/zstd/zstd_table_reader.h" using silo::common::add1; using silo::common::Range; @@ -112,7 +112,7 @@ void addSequencesFromResultTableToJson( const std::string& sequence_name = sequence_names.at(sequence_idx); const std::string_view compression_dict = database_partition.unaligned_nuc_sequences.at(sequence_name).compression_dictionary; - silo::ZstdFastaTableReader table_reader( + silo::ZstdTableReader table_reader( connection, result_table_name, compression_dict, diff --git a/src/silo/storage/reference_genomes.cpp b/src/silo/storage/reference_genomes.cpp index 7f771fdb3..44ee54bc4 100644 --- a/src/silo/storage/reference_genomes.cpp +++ b/src/silo/storage/reference_genomes.cpp @@ -241,5 +241,4 @@ std::string ReferenceGenomes::vectorToString( template <> std::string ReferenceGenomes::vectorToString(const std::vector& vector ); - } // namespace silo \ No newline at end of file diff --git a/src/silo/storage/sequence_store.cpp b/src/silo/storage/sequence_store.cpp index 05ee6bd0d..7b9317186 100644 --- a/src/silo/storage/sequence_store.cpp +++ b/src/silo/storage/sequence_store.cpp @@ -15,15 +15,17 @@ #include "silo/common/nucleotide_symbols.h" #include "silo/common/string_utils.h" #include "silo/common/symbol_map.h" +#include "silo/common/table_reader.h" #include "silo/preprocessing/preprocessing_exception.h" #include "silo/storage/position.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" +#include "silo/zstd/zstd_decompressor.h" template silo::SequenceStorePartition::SequenceStorePartition( const std::vector& reference_sequence ) : reference_sequence(reference_sequence) { + lazy_buffer.reserve(BUFFER_SIZE); positions.reserve(reference_sequence.size()); for (const auto symbol : reference_sequence) { positions.emplace_back(Position::fromInitiallyFlipped(symbol)); @@ -31,41 +33,14 @@ silo::SequenceStorePartition::SequenceStorePartition( } template -size_t silo::SequenceStorePartition::fill(ZstdFastaTableReader& input) { - static constexpr size_t BUFFER_SIZE = 1024; - - input.loadTable(); - - size_t read_sequences_count = 0; - - std::vector> genome_buffer; - - std::optional key; - std::optional genome; - while (true) { - key = input.next(genome); - if (!key) { - break; - } - genome_buffer.push_back(std::move(genome)); - if (genome_buffer.size() >= BUFFER_SIZE) { - interpret(genome_buffer); - genome_buffer.clear(); - } - - ++read_sequences_count; +silo::ReadSequence& silo::SequenceStorePartition::appendNewSequenceRead() { + if (lazy_buffer.size() > BUFFER_SIZE) { + flushBuffer(lazy_buffer); + lazy_buffer.clear(); } - interpret(genome_buffer); - const SequenceStoreInfo info_before_optimisation = getInfo(); - optimizeBitmaps(); - - SPDLOG_DEBUG( - "Sequence store partition info after filling it: {}, and after optimising: {}", - info_before_optimisation, - getInfo() - ); - return read_sequences_count; + lazy_buffer.emplace_back(); + return lazy_buffer.back(); } namespace { @@ -112,6 +87,9 @@ void silo::SequenceStorePartition::insertInsertion( template void silo::SequenceStorePartition::finalize() { + flushBuffer(lazy_buffer); + lazy_buffer.clear(); + SPDLOG_DEBUG("Building insertion index"); insertion_index.buildIndex(); @@ -159,9 +137,7 @@ const roaring::Roaring* silo::SequenceStorePartition::getBitmap( } template -void silo::SequenceStorePartition::fillIndexes( - const std::vector>& genomes -) { +void silo::SequenceStorePartition::fillIndexes(const std::vector& reads) { const size_t genome_length = positions.size(); static constexpr int COUNT_SYMBOLS_PER_PROCESSOR = 64; tbb::parallel_for( @@ -169,13 +145,13 @@ void silo::SequenceStorePartition::fillIndexes( [&](const auto& local) { SymbolMap> ids_per_symbol_for_current_position; for (size_t position_idx = local.begin(); position_idx != local.end(); ++position_idx) { - const size_t number_of_sequences = genomes.size(); + const size_t number_of_sequences = reads.size(); for (size_t sequence_id = 0; sequence_id < number_of_sequences; ++sequence_id) { - const auto& genome = genomes[sequence_id]; - if (!genome.has_value()) { + const auto& [is_valid, sequence, offset] = reads[sequence_id]; + if (!is_valid || position_idx < offset || position_idx - offset >= sequence.size()) { continue; } - const char character = genome.value()[position_idx]; + const char character = sequence[position_idx - offset]; const auto symbol = SymbolType::charToSymbol(character); if (!symbol.has_value()) { throw silo::preprocessing::PreprocessingException( @@ -211,34 +187,38 @@ void silo::SequenceStorePartition::addSymbolsToPositions( } template -void silo::SequenceStorePartition::fillNBitmaps( - const std::vector>& genomes +void silo::SequenceStorePartition::fillNBitmaps(const std::vector& reads ) { const size_t genome_length = positions.size(); - missing_symbol_bitmaps.resize(sequence_count + genomes.size()); + missing_symbol_bitmaps.resize(sequence_count + reads.size()); - const tbb::blocked_range range(0, genomes.size()); + const tbb::blocked_range range(0, reads.size()); tbb::parallel_for(range, [&](const decltype(range)& local) { std::vector positions_with_symbol_missing; for (size_t sequence_index = local.begin(); sequence_index != local.end(); ++sequence_index) { - const auto& maybe_genome = genomes[sequence_index]; + const auto& [is_valid, maybe_sequence, offset] = reads[sequence_index]; - if (!maybe_genome.has_value()) { + if (!is_valid) { missing_symbol_bitmaps[sequence_count + sequence_index].addRange(0, genome_length); missing_symbol_bitmaps[sequence_count + sequence_index].runOptimize(); continue; } - const auto& genome = maybe_genome.value(); + missing_symbol_bitmaps[sequence_count + sequence_index].addRange(0, offset); - for (size_t position_idx = 0; position_idx < genome_length; ++position_idx) { - const char character = genome[position_idx]; + for (size_t position_idx = 0; position_idx < maybe_sequence.size(); ++position_idx) { + const char character = maybe_sequence[position_idx]; const auto symbol = SymbolType::charToSymbol(character); if (symbol == SymbolType::SYMBOL_MISSING) { - positions_with_symbol_missing.push_back(position_idx); + positions_with_symbol_missing.push_back(position_idx + offset); } } + + missing_symbol_bitmaps[sequence_count + sequence_index].addRange( + offset + maybe_sequence.size(), genome_length + ); + if (!positions_with_symbol_missing.empty()) { missing_symbol_bitmaps[sequence_count + sequence_index].addMany( positions_with_symbol_missing.size(), positions_with_symbol_missing.data() @@ -272,12 +252,10 @@ void silo::SequenceStorePartition::optimizeBitmaps() { } template -void silo::SequenceStorePartition::interpret( - const std::vector>& genomes -) { - fillIndexes(genomes); - fillNBitmaps(genomes); - sequence_count += genomes.size(); +void silo::SequenceStorePartition::flushBuffer(const std::vector& reads) { + fillIndexes(reads); + fillNBitmaps(reads); + sequence_count += reads.size(); } template diff --git a/src/silo/storage/unaligned_sequence_store.cpp b/src/silo/storage/unaligned_sequence_store.cpp index 88548d695..3a4845659 100644 --- a/src/silo/storage/unaligned_sequence_store.cpp +++ b/src/silo/storage/unaligned_sequence_store.cpp @@ -16,7 +16,6 @@ #include "silo/common/symbol_map.h" #include "silo/persistence/exception.h" #include "silo/preprocessing/preprocessing_exception.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" silo::UnalignedSequenceStorePartition::UnalignedSequenceStorePartition( std::string sql_for_reading_file, diff --git a/src/silo/zstdfasta/zstd_compressor.cpp b/src/silo/zstd/zstd_compressor.cpp similarity index 96% rename from src/silo/zstdfasta/zstd_compressor.cpp rename to src/silo/zstd/zstd_compressor.cpp index bf0011aa3..cb63d484b 100644 --- a/src/silo/zstdfasta/zstd_compressor.cpp +++ b/src/silo/zstd/zstd_compressor.cpp @@ -1,4 +1,4 @@ -#include "silo/zstdfasta/zstd_compressor.h" +#include "silo/zstd/zstd_compressor.h" #include #include diff --git a/src/silo/zstdfasta/zstd_context.cpp b/src/silo/zstd/zstd_context.cpp similarity index 95% rename from src/silo/zstdfasta/zstd_context.cpp rename to src/silo/zstd/zstd_context.cpp index 470beb7b3..ceb716aa7 100644 --- a/src/silo/zstdfasta/zstd_context.cpp +++ b/src/silo/zstd/zstd_context.cpp @@ -1,4 +1,4 @@ -#include "silo/zstdfasta/zstd_context.h" +#include "silo/zstd/zstd_context.h" #include #include diff --git a/src/silo/zstdfasta/zstd_decompressor.cpp b/src/silo/zstd/zstd_decompressor.cpp similarity index 77% rename from src/silo/zstdfasta/zstd_decompressor.cpp rename to src/silo/zstd/zstd_decompressor.cpp index 74530140c..369a43530 100644 --- a/src/silo/zstdfasta/zstd_decompressor.cpp +++ b/src/silo/zstd/zstd_decompressor.cpp @@ -1,21 +1,26 @@ -#include "silo/zstdfasta/zstd_decompressor.h" +#include "silo/zstd/zstd_decompressor.h" #include #include #include #include +#include namespace silo { ZstdDecompressor::ZstdDecompressor(std::string_view dictionary_string) : zstd_dictionary(ZstdDDictionary(dictionary_string)) {} -std::string_view ZstdDecompressor::decompress(const std::string& input) { - return decompress(input.data(), input.size()); +void ZstdDecompressor::decompress(const std::string& input, std::string& buffer) { + decompress(input.data(), input.size(), buffer); } -std::string_view ZstdDecompressor::decompress(const char* input_data, size_t input_length) { +void ZstdDecompressor::decompress( + const char* input_data, + size_t input_length, + std::string& buffer +) { const size_t uncompressed_size = ZSTD_getFrameContentSize(input_data, input_length); if (uncompressed_size == ZSTD_CONTENTSIZE_UNKNOWN) { throw std::runtime_error(fmt::format( @@ -32,9 +37,7 @@ std::string_view ZstdDecompressor::decompress(const char* input_data, size_t inp input_length )); } - if (uncompressed_size > buffer.size()) { - buffer.resize(uncompressed_size); - } + buffer.resize(uncompressed_size); auto size_or_error_code = ZSTD_decompress_usingDDict( zstd_context.value, buffer.data(), @@ -49,7 +52,7 @@ std::string_view ZstdDecompressor::decompress(const char* input_data, size_t inp fmt::format("Error '{}' in dependency when decompressing using zstd", error_name) ); } - return {buffer.data(), size_or_error_code}; + assert(uncompressed_size == size_or_error_code); } } // namespace silo \ No newline at end of file diff --git a/src/silo/zstdfasta/zstd_dictionary.cpp b/src/silo/zstd/zstd_dictionary.cpp similarity index 95% rename from src/silo/zstdfasta/zstd_dictionary.cpp rename to src/silo/zstd/zstd_dictionary.cpp index 78b5faeb3..5174de474 100644 --- a/src/silo/zstdfasta/zstd_dictionary.cpp +++ b/src/silo/zstd/zstd_dictionary.cpp @@ -1,4 +1,4 @@ -#include "silo/zstdfasta/zstd_dictionary.h" +#include "silo/zstd/zstd_dictionary.h" #include diff --git a/src/silo/zstd/zstd_table.cpp b/src/silo/zstd/zstd_table.cpp new file mode 100644 index 000000000..05b086338 --- /dev/null +++ b/src/silo/zstd/zstd_table.cpp @@ -0,0 +1,63 @@ +#include "silo/zstd/zstd_table.h" + +#include +#include +#include + +#include "silo/preprocessing/preprocessing_exception.h" +#include "silo/zstd/zstd_compressor.h" + +namespace { +void initializeTable(duckdb::Connection& connection, std::string table_name) { + auto return_value = connection.Query(fmt::format("DROP TABLE IF EXISTS {};", table_name)); + if (return_value->HasError()) { + throw silo::preprocessing::PreprocessingException(return_value->GetError()); + } + return_value = connection.Query(fmt::format( + "CREATE TABLE {} (" + " key STRING," + " read STRUCT(\"offset\" UINTEGER, sequence BLOB)" + ");", + table_name + )); + if (return_value->HasError()) { + throw silo::preprocessing::PreprocessingException(return_value->GetError()); + } +} +} // namespace + +namespace silo { + +using silo::sequence_file_reader::SequenceFileReader; + +ZstdTable ZstdTable::generate( + duckdb::Connection& connection, + const std::string& table_name, + SequenceFileReader& file_reader, + std::string_view reference_sequence +) { + initializeTable(connection, table_name); + std::optional entry; + ZstdCompressor compressor(std::make_shared(reference_sequence, 2)); + duckdb::Appender appender(connection, table_name); + while (true) { + entry = file_reader.nextEntry(); + if (!entry) { + break; + } + const std::string_view compressed = compressor.compress(entry->sequence); + const auto* compressed_data = reinterpret_cast(compressed.data()); + const duckdb::string_t key_value = entry->key; + appender.BeginRow(); + appender.Append(key_value); + appender.Append(duckdb::Value::STRUCT( + {{"\"offset\"", duckdb::Value::UINTEGER(entry.value().offset)}, + {"sequence", duckdb::Value::BLOB(compressed_data, compressed.size())}} + )); + appender.EndRow(); + } + appender.Close(); + return {connection, table_name}; +} + +} // namespace silo \ No newline at end of file diff --git a/src/silo/zstdfasta/zstdfasta_table_reader.cpp b/src/silo/zstd/zstd_table_reader.cpp similarity index 68% rename from src/silo/zstdfasta/zstdfasta_table_reader.cpp rename to src/silo/zstd/zstd_table_reader.cpp index 2123d5e0e..3e5d58955 100644 --- a/src/silo/zstdfasta/zstdfasta_table_reader.cpp +++ b/src/silo/zstd/zstd_table_reader.cpp @@ -1,4 +1,4 @@ -#include "silo/zstdfasta/zstdfasta_table_reader.h" +#include "silo/zstd/zstd_table_reader.h" #include #include @@ -9,9 +9,9 @@ #include #include "silo/preprocessing/preprocessing_exception.h" -#include "silo/zstdfasta/zstd_decompressor.h" +#include "silo/zstd/zstd_decompressor.h" -silo::ZstdFastaTableReader::ZstdFastaTableReader( +silo::ZstdTableReader::ZstdTableReader( duckdb::Connection& connection, std::string_view table_name, std::string_view compression_dict, @@ -25,10 +25,10 @@ silo::ZstdFastaTableReader::ZstdFastaTableReader( where_clause(where_clause), order_by_clause(order_by_clause), decompressor(std::make_unique(compression_dict)) { - SPDLOG_TRACE("Initializing ZstdFastaTableReader for table {}", table_name); + SPDLOG_TRACE("Initializing ZstdTableReader for table {}", table_name); } -std::optional silo::ZstdFastaTableReader::nextKey() { +std::optional silo::ZstdTableReader::nextKey() { if (!current_chunk) { return std::nullopt; } @@ -36,7 +36,7 @@ std::optional silo::ZstdFastaTableReader::nextKey() { return current_chunk->GetValue(0, current_row).GetValue(); } -std::optional silo::ZstdFastaTableReader::nextSkipGenome() { +std::optional silo::ZstdTableReader::nextSkipGenome() { auto key = nextKey(); if (!key) { @@ -47,7 +47,7 @@ std::optional silo::ZstdFastaTableReader::nextSkipGenome() { return key; } -std::optional silo::ZstdFastaTableReader::nextCompressed( +std::optional silo::ZstdTableReader::nextCompressed( std::optional& compressed_genome ) { auto key = nextKey(); @@ -59,14 +59,19 @@ std::optional silo::ZstdFastaTableReader::nextCompressed( if (value.IsNull()) { compressed_genome = std::nullopt; } else { - compressed_genome = value.GetValueUnsafe(); + const auto& children = duckdb::StructValue::GetChildren(value); + if (children[1].IsNull()) { + compressed_genome = std::nullopt; + return std::nullopt; + } + compressed_genome = children[1].GetValueUnsafe(); } advanceRow(); return key; } -std::optional silo::ZstdFastaTableReader::next(std::optional& genome) { +std::optional silo::ZstdTableReader::next(std::optional& genome) { std::optional compressed_buffer; auto key = nextCompressed(compressed_buffer); @@ -75,7 +80,11 @@ std::optional silo::ZstdFastaTableReader::next(std::optionaldecompress(compressed_buffer.value()); + if (!genome) { // TODO(#230) this is not good. it is doing the opposite of buffering and + // should be removed + genome = std::string(); + } + decompressor->decompress(compressed_buffer.value(), genome.value()); } else { genome = std::nullopt; } @@ -83,7 +92,7 @@ std::optional silo::ZstdFastaTableReader::next(std::optionalsize()) { current_row = 0; diff --git a/src/silo/zstdfasta/zstdfasta_reader.cpp b/src/silo/zstdfasta/zstdfasta_reader.cpp deleted file mode 100644 index c0b00668a..000000000 --- a/src/silo/zstdfasta/zstdfasta_reader.cpp +++ /dev/null @@ -1,82 +0,0 @@ -#include "silo/zstdfasta/zstdfasta_reader.h" - -#include -#include - -#include "silo/common/fasta_format_exception.h" -#include "silo/zstdfasta/zstd_decompressor.h" - -silo::ZstdFastaReader::ZstdFastaReader( - const std::filesystem::path& in_file_name, - std::string_view compression_dict -) - : decompressor(std::make_unique(compression_dict)) { - in_file = std::ifstream(in_file_name); - if (!in_file) { - throw std::runtime_error("Could not open file reader for file: " + in_file_name.string()); - } -} - -std::optional silo::ZstdFastaReader::nextKey() { - std::string key_with_prefix; - if (!getline(in_file, key_with_prefix)) { - return std::nullopt; - } - - if (key_with_prefix.empty() || key_with_prefix.at(0) != '>') { - throw FastaFormatException("Fasta key prefix '>' missing for key: " + key_with_prefix); - } - - return key_with_prefix.substr(1); -} - -std::optional silo::ZstdFastaReader::nextSkipGenome() { - auto key = nextKey(); - - if (!key) { - return std::nullopt; - } - - std::string bytestream_length_str; - if (!getline(in_file, bytestream_length_str)) { - throw FastaFormatException("Missing bytestream length in line following key: " + *key); - } - const size_t bytestream_length = std::stoul(bytestream_length_str); - - in_file.ignore(static_cast(bytestream_length)); - return key; -} - -std::optional silo::ZstdFastaReader::nextCompressed(std::string& compressed_genome) { - auto key = nextKey(); - if (!key) { - return std::nullopt; - } - - std::string bytestream_length_str; - if (!getline(in_file, bytestream_length_str)) { - throw FastaFormatException("Missing bytestream length in line following key: " + *key); - } - const size_t bytestream_length = std::stoul(bytestream_length_str); - - compressed_genome.resize(bytestream_length); - in_file.read(compressed_genome.data(), static_cast(compressed_genome.size())); - in_file.ignore(1); - return key; -} - -std::optional silo::ZstdFastaReader::next(std::string& genome) { - std::string compressed_buffer; - auto key = nextCompressed(compressed_buffer); - - if (!key) { - return std::nullopt; - } - genome = decompressor->decompress(compressed_buffer); - return key; -} - -void silo::ZstdFastaReader::reset() { - in_file.clear(); // clear fail and eof bits - in_file.seekg(0, std::ios::beg); // g pointer back to the start -} diff --git a/src/silo/zstdfasta/zstdfasta_reader.test.cpp b/src/silo/zstdfasta/zstdfasta_reader.test.cpp deleted file mode 100644 index 5d78357c2..000000000 --- a/src/silo/zstdfasta/zstdfasta_reader.test.cpp +++ /dev/null @@ -1,82 +0,0 @@ -#include -#include -#include - -#include "silo/common/fasta_format_exception.h" -#include "silo/config/preprocessing_config.h" -#include "silo/zstdfasta/zstd_decompressor.h" -#include "silo/zstdfasta/zstdfasta_reader.h" - -TEST(ZstdFastaReader, shouldReadFastaFile) { - const std::filesystem::path input_directory{"testBaseData/fastaFiles/"}; - const std::string sequence_filename{"test.zstdfasta"}; - - std::filesystem::path file_path = input_directory; - file_path += sequence_filename; - - silo::ZstdFastaReader under_test(file_path, "ACGT"); - - std::optional key; - std::string genome; - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key1"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key2"); - EXPECT_EQ(genome, "CGTA"); - - EXPECT_FALSE(key = under_test.next(genome)); -} - -TEST(ZstdFastaReader, shouldReadFastaFileWithoutNewLineAtEnd) { - const std::filesystem::path input_directory{"testBaseData/fastaFiles/"}; - const std::string sequence_filename{"no_end_new_line.zstdfasta"}; - - std::filesystem::path file_path = input_directory; - file_path += sequence_filename; - - silo::ZstdFastaReader under_test(file_path, "ACGT"); - - std::optional key; - std::string genome; - key = under_test.next(genome); - EXPECT_TRUE(key != std::nullopt); - EXPECT_EQ(key, "Key"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_FALSE(key = under_test.next(genome)); -} - -TEST(ZstdFastaReader, givenDataInWrongFormatThenShouldThrowAnException) { - const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; - const std::string sequence_filename{"wrong_format.zstdfasta"}; - - std::filesystem::path file_path = input_directory; - file_path += sequence_filename; - - silo::ZstdFastaReader under_test(file_path, "ACGT"); - - std::string genome; - EXPECT_THROW(under_test.next(genome), silo::FastaFormatException); -} - -TEST(ZstdFastaReader, givenDataInWithMissingGenomeThenShouldThrowAnException) { - const std::filesystem::path input_directory{"./testBaseData/fastaFiles/"}; - const std::string sequence_filename{"missing_genome.zstdfasta"}; - - std::filesystem::path file_path = input_directory; - file_path += sequence_filename; - - silo::ZstdFastaReader under_test(file_path, "ACGT"); - - std::optional key; - std::string genome; - key = under_test.next(genome); - EXPECT_TRUE(key != std::nullopt); - EXPECT_EQ(key, "Key"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_THROW(key = under_test.next(genome), silo::FastaFormatException); -} \ No newline at end of file diff --git a/src/silo/zstdfasta/zstdfasta_table.cpp b/src/silo/zstdfasta/zstdfasta_table.cpp deleted file mode 100644 index 6626f9437..000000000 --- a/src/silo/zstdfasta/zstdfasta_table.cpp +++ /dev/null @@ -1,106 +0,0 @@ -#include "silo/zstdfasta/zstdfasta_table.h" - -#include -#include - -#include "silo/common/fasta_reader.h" -#include "silo/preprocessing/preprocessing_exception.h" -#include "silo/zstdfasta/zstd_compressor.h" -#include "silo/zstdfasta/zstdfasta_reader.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" - -namespace { -void initializeTable(duckdb::Connection& connection, std::string table_name) { - auto return_value = connection.Query(fmt::format("DROP TABLE IF EXISTS {};", table_name)); - if (return_value->HasError()) { - throw silo::preprocessing::PreprocessingException(return_value->GetError()); - } - return_value = connection.Query(fmt::format( - "CREATE TABLE {} (" - " key STRING," - " sequence BLOB" - ");", - table_name - )); - if (return_value->HasError()) { - throw silo::preprocessing::PreprocessingException(return_value->GetError()); - } -} -} // namespace - -namespace silo { - -ZstdFastaTable::ZstdFastaTable( - duckdb::Connection& connection, - std::string table_name, - std::string_view compression_dict -) - : connection(connection), - table_name(std::move(table_name)), - compression_dict(compression_dict) {} - -ZstdFastaTable ZstdFastaTable::generate( - duckdb::Connection& connection, - const std::string& table_name, - ZstdFastaReader& file_reader, - std::string_view reference_sequence -) { - initializeTable(connection, table_name); - std::optional key; - std::string compressed; - duckdb::Appender appender(connection, table_name); - while (true) { - key = file_reader.nextCompressed(compressed); - if (key == std::nullopt) { - break; - } - const size_t compressed_size = compressed.size(); - const auto* compressed_data = reinterpret_cast(compressed.data()); - const duckdb::string_t key_value = key.value(); - appender.BeginRow(); - appender.Append(key_value); - appender.Append(duckdb::Value::BLOB(compressed_data, compressed_size)); - appender.EndRow(); - } - appender.Close(); - return {connection, table_name, reference_sequence}; -} - -ZstdFastaTable ZstdFastaTable::generate( - duckdb::Connection& connection, - const std::string& table_name, - FastaReader& file_reader, - std::string_view reference_sequence -) { - initializeTable(connection, table_name); - std::optional key; - std::string uncompressed; - ZstdCompressor compressor(std::make_shared(reference_sequence, 2)); - duckdb::Appender appender(connection, table_name); - while (true) { - key = file_reader.next(uncompressed); - if (key == std::nullopt) { - break; - } - const std::string_view compressed = compressor.compress(uncompressed); - const auto* compressed_data = reinterpret_cast(compressed.data()); - const duckdb::string_t key_value = key.value(); - appender.BeginRow(); - appender.Append(key_value); - appender.Append(duckdb::Value::BLOB(compressed_data, compressed.size())); - appender.EndRow(); - } - appender.Close(); - return {connection, table_name, reference_sequence}; -} - -ZstdFastaTableReader ZstdFastaTable::getReader( - std::string_view where_clause, - std::string_view order_by_clause -) { - return ZstdFastaTableReader( - connection, table_name, compression_dict, "sequence", where_clause, order_by_clause - ); -} - -} // namespace silo \ No newline at end of file diff --git a/src/silo/zstdfasta/zstdfasta_table_reader.test.cpp b/src/silo/zstdfasta/zstdfasta_table_reader.test.cpp deleted file mode 100644 index 5030b2d5a..000000000 --- a/src/silo/zstdfasta/zstdfasta_table_reader.test.cpp +++ /dev/null @@ -1,135 +0,0 @@ -#include -#include -#include - -#include - -#include "silo/common/fasta_reader.h" -#include "silo/zstdfasta/zstd_decompressor.h" -#include "silo/zstdfasta/zstdfasta_reader.h" -#include "silo/zstdfasta/zstdfasta_table.h" -#include "silo/zstdfasta/zstdfasta_table_reader.h" - -TEST(ZstdFastaTableReader, correctlyReadsZstdFastaTableFromFastaFile) { - const std::filesystem::path input_directory{"testBaseData/fastaFiles/"}; - const std::string sequence_filename{"test.fasta"}; - - const std::filesystem::path file_path = input_directory / sequence_filename; - - silo::FastaReader file_reader(file_path); - - duckdb::DuckDB duck_db(nullptr); - duckdb::Connection connection(duck_db); - - silo::ZstdFastaTable::generate(connection, "test", file_reader, "ACGT"); - - silo::ZstdFastaTableReader under_test(connection, "test", "ACGT", "sequence", "true", ""); - under_test.loadTable(); - - std::optional key; - std::optional genome; - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key1"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key2"); - EXPECT_EQ(genome, "CGTA"); - - EXPECT_FALSE(key = under_test.next(genome)); -} - -TEST(ZstdFastaTableReader, correctlyReadsZstdFastaTableFromZstdFastaFile) { - const std::filesystem::path input_directory{"testBaseData/fastaFiles/"}; - const std::string sequence_filename{"test.zstdfasta"}; - - const std::filesystem::path file_path = input_directory / sequence_filename; - - silo::ZstdFastaReader file_reader(file_path, "ACGT"); - - duckdb::DuckDB duck_db(nullptr); - duckdb::Connection connection(duck_db); - - silo::ZstdFastaTable::generate(connection, "test", file_reader, "ACGT"); - - silo::ZstdFastaTableReader under_test(connection, "test", "ACGT", "sequence", "true", ""); - under_test.loadTable(); - - std::optional key; - std::optional genome; - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key1"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key2"); - EXPECT_EQ(genome, "CGTA"); - - EXPECT_FALSE(key = under_test.next(genome)); -} - -TEST(ZstdFastaTableReader, correctlySortsZstdFastaTableFromFastaFile) { - const std::filesystem::path input_directory{"testBaseData/fastaFiles/"}; - const std::string sequence_filename{"test.fasta"}; - - const std::filesystem::path file_path = input_directory / sequence_filename; - - silo::FastaReader file_reader(file_path); - - duckdb::DuckDB duck_db(nullptr); - duckdb::Connection connection(duck_db); - - silo::ZstdFastaTable::generate(connection, "test", file_reader, "ACGT"); - - silo::ZstdFastaTableReader under_test( - connection, "test", "ACGT", "sequence", "true", "ORDER BY key desc" - ); - under_test.loadTable(); - - std::optional key; - std::optional genome; - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key2"); - EXPECT_EQ(genome, "CGTA"); - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key1"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_FALSE(key = under_test.next(genome)); -} - -TEST(ZstdFastaTableReader, correctlySortsZstdFastaTableFromZstdFastaFile) { - const std::filesystem::path input_directory{"testBaseData/fastaFiles/"}; - const std::string sequence_filename{"test.zstdfasta"}; - - const std::filesystem::path file_path = input_directory / sequence_filename; - - silo::ZstdFastaReader file_reader(file_path, "ACGT"); - - duckdb::DuckDB duck_db(nullptr); - duckdb::Connection connection(duck_db); - - silo::ZstdFastaTable::generate(connection, "test", file_reader, "ACGT"); - - silo::ZstdFastaTableReader under_test( - connection, "test", "ACGT", "sequence", "true", "ORDER BY key desc" - ); - under_test.loadTable(); - - std::optional key; - std::optional genome; - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key2"); - EXPECT_EQ(genome, "CGTA"); - - EXPECT_TRUE(key = under_test.next(genome)); - EXPECT_EQ(key, "Key1"); - EXPECT_EQ(genome, "ACGT"); - - EXPECT_FALSE(key = under_test.next(genome)); -} \ No newline at end of file diff --git a/src/silo/zstdfasta/zstdfasta_writer.cpp b/src/silo/zstdfasta/zstdfasta_writer.cpp deleted file mode 100644 index d0c5a2cfe..000000000 --- a/src/silo/zstdfasta/zstdfasta_writer.cpp +++ /dev/null @@ -1,79 +0,0 @@ -#include "silo/zstdfasta/zstdfasta_writer.h" - -#include -#include - -#include - -#include "silo/zstdfasta/zstd_compressor.h" - -silo::ZstdFastaWriter::ZstdFastaWriter( - const std::filesystem::path& out_file, - std::string_view compression_dict -) - : compressor( - std::make_unique(std::make_shared(compression_dict, 2)) - ) { - if (!exists(out_file)) { - SPDLOG_DEBUG("ZSTD Sequence Writer at {} does not yet exist", out_file.string()); - if (!exists(out_file.parent_path())) { - SPDLOG_DEBUG("Parent path {} does not yet exist as well", out_file.string()); - if (!create_directories(out_file.parent_path())) { - SPDLOG_DEBUG("Parent path {} could not be created", out_file.string()); - throw std::runtime_error( - "Could not create zstdwriter for file " + std::string{out_file.string()} - ); - } - } - } - outStream = std::ofstream(out_file.string()); - - if (!outStream) { - throw std::runtime_error( - "Could not create ofstream for file " + std::string{out_file.string()} - ); - } - SPDLOG_DEBUG("ZSTD Sequence Writer at {} successfully created", out_file.string()); -} - -silo::ZstdFastaWriter::ZstdFastaWriter( - const std::filesystem::path& out_file, - std::string_view compression_dict, - const std::string& default_sequence_ -) - : ZstdFastaWriter(out_file, compression_dict) { - default_sequence = std::string(compressor->compress(default_sequence_)); -} - -// NOLINTNEXTLINE(bugprone-easily-swappable-parameters) -void silo::ZstdFastaWriter::write(const std::string& key, const std::string& genome) { - const std::string_view compressed = compressor->compress(genome); - - outStream << '>' << key << '\n' << std::to_string(compressed.size()) << '\n'; - outStream.write(compressed.data(), static_cast(compressed.size())); - outStream << '\n'; -} - -void silo::ZstdFastaWriter::writeRaw(const std::string& key, const std::string& compressed_genome) { - outStream << '>' << key << '\n' - << std::to_string(compressed_genome.size()) << '\n' - << compressed_genome << '\n'; -} - -void silo::ZstdFastaWriter::writeRaw(const std::string& key, std::string_view compressed_genome) { - outStream << '>' << key << '\n' - << std::to_string(compressed_genome.size()) << '\n' - << compressed_genome << '\n'; -} - -void silo::ZstdFastaWriter::writeDefault(const std::string& key) { - if (default_sequence == std::nullopt) { - throw std::runtime_error( - "Tried to write default sequence, when none was provided in the ZstdFastaWriter " - "constructor" - ); - } - outStream << '>' << key << '\n' - << std::to_string(default_sequence->size()) << '\n' - << *default_sequence << '\n'; -} diff --git a/src/silo/zstdfasta/zstdfasta_writer.test.cpp b/src/silo/zstdfasta/zstdfasta_writer.test.cpp deleted file mode 100644 index de0fc22f1..000000000 --- a/src/silo/zstdfasta/zstdfasta_writer.test.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include -#include -#include - -#include "silo/config/preprocessing_config.h" -#include "silo/zstdfasta/zstd_decompressor.h" -#include "silo/zstdfasta/zstdfasta_reader.h" -#include "silo/zstdfasta/zstdfasta_writer.h" - -TEST(ZstdFastaWriter, writesCorrectFiles) { - const std::filesystem::path file_path("./testBaseData/tmp/test.fasta"); - - const std::string reference_genome = "ACGTACGTACGTACGT"; - const std::vector> values{ - {"Key1", "ACGTACGTACGTACGT"}, - {"Key2", "ACGTACGTACGTCCGT"}, - {"Key3", "ACGTACGTACGTACGT"}, - {"Key4", "CAGTTCGTACGTACGT"}, - {"Key5", "ACGTACGTACCTACGC"} - }; - - { - silo::ZstdFastaWriter under_test(file_path, reference_genome); - - for (const auto& value : values) { - under_test.write(value.first, value.second); - } - } - - { - silo::ZstdFastaReader reader(file_path, reference_genome); - - std::optional key; - std::string genome; - - for (const auto& value : values) { - key = reader.next(genome); - EXPECT_TRUE(key != std::nullopt); - EXPECT_EQ(key, value.first); - EXPECT_EQ(genome, value.second); - } - key = reader.next(genome); - EXPECT_FALSE(key != std::nullopt); - } -} diff --git a/testBaseData/fastaFiles/blankLines.fasta b/testBaseData/fastaFiles/blankLines.fasta new file mode 100644 index 000000000..c9dad2bb7 --- /dev/null +++ b/testBaseData/fastaFiles/blankLines.fasta @@ -0,0 +1,9 @@ + + +>Key1 +ACGT + + + +>Key2 +CGTA diff --git a/testBaseData/fastaFiles/withOffset.fasta b/testBaseData/fastaFiles/withOffset.fasta new file mode 100644 index 000000000..1db50d2e9 --- /dev/null +++ b/testBaseData/fastaFiles/withOffset.fasta @@ -0,0 +1,4 @@ +>Key1|10 +ACGT +>Key2|20 +CGTA diff --git a/testBaseData/samFiles/aa_insertions.tsv b/testBaseData/samFiles/aa_insertions.tsv new file mode 100644 index 000000000..36ca1315d --- /dev/null +++ b/testBaseData/samFiles/aa_insertions.tsv @@ -0,0 +1,3 @@ +accessionVersion someLongGene someShortGene +1.1 [214:EPE] [] +1.1 [] [] \ No newline at end of file diff --git a/testBaseData/samFiles/database_config.yaml b/testBaseData/samFiles/database_config.yaml new file mode 100644 index 000000000..cc0a6826c --- /dev/null +++ b/testBaseData/samFiles/database_config.yaml @@ -0,0 +1,13 @@ +schema: + instanceName: Test + metadata: + - name: date + type: date + - name: host + type: string + - name: pangoLineage + type: pango_lineage + - name: accessionVersion + type: string + primaryKey: accessionVersion + dateToSortBy: date diff --git a/testBaseData/samFiles/gene_someLongGene.fasta b/testBaseData/samFiles/gene_someLongGene.fasta new file mode 100644 index 000000000..5272ea776 --- /dev/null +++ b/testBaseData/samFiles/gene_someLongGene.fasta @@ -0,0 +1,2 @@ +>1.1 +ACDEFGHIKLMNPQRSTVWYBZX-* \ No newline at end of file diff --git a/testBaseData/samFiles/gene_someShortGene.fasta b/testBaseData/samFiles/gene_someShortGene.fasta new file mode 100644 index 000000000..6aeb8d1d6 --- /dev/null +++ b/testBaseData/samFiles/gene_someShortGene.fasta @@ -0,0 +1,2 @@ +>1.1 +MADS \ No newline at end of file diff --git a/testBaseData/samFiles/metadata.tsv b/testBaseData/samFiles/metadata.tsv new file mode 100644 index 000000000..d1ef65889 --- /dev/null +++ b/testBaseData/samFiles/metadata.tsv @@ -0,0 +1,3 @@ +date host accessionVersion pangoLineage +2002-12-15 google.com 1.1 XBB.1.5 + 1.3 XBB.1.5 \ No newline at end of file diff --git a/testBaseData/samFiles/nuc_insertions.tsv b/testBaseData/samFiles/nuc_insertions.tsv new file mode 100644 index 000000000..5986c3403 --- /dev/null +++ b/testBaseData/samFiles/nuc_insertions.tsv @@ -0,0 +1,2 @@ +accessionVersion main secondSegment +1.1 [] [] diff --git a/testBaseData/samFiles/nuc_main.sam b/testBaseData/samFiles/nuc_main.sam new file mode 100644 index 000000000..fd200cb3b --- /dev/null +++ b/testBaseData/samFiles/nuc_main.sam @@ -0,0 +1,2 @@ +1.1 99 NC_045512.2 10 60 5S246M = 201 404 TATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:1 MD:Z:197C48 MC:Z:247M AS:i:241 XS:i:0 +1.3 99 NC_045512.2 5 60 47S204M = 209 400 TATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:1 MD:Z:196C7 MC:Z:236M AS:i:199 XS:i:0 SA:Z:NC_045512.2,7769,+,30M221S,60,0; diff --git a/testBaseData/samFiles/nuc_secondSegment.fasta b/testBaseData/samFiles/nuc_secondSegment.fasta new file mode 100644 index 000000000..e69de29bb diff --git a/testBaseData/samFiles/preprocessing_config.yaml b/testBaseData/samFiles/preprocessing_config.yaml new file mode 100644 index 000000000..139c1112a --- /dev/null +++ b/testBaseData/samFiles/preprocessing_config.yaml @@ -0,0 +1,2 @@ +inputDirectory: "testBaseData/samFiles" +metadataFilename: "metadata.tsv" diff --git a/testBaseData/samFiles/reference_genomes.json b/testBaseData/samFiles/reference_genomes.json new file mode 100644 index 000000000..059643fd9 --- /dev/null +++ b/testBaseData/samFiles/reference_genomes.json @@ -0,0 +1,22 @@ +{ + "nucleotideSequences": [ + { + "name": "main", + "sequence": "ATTAAAGGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCT" + }, + { + "name": "secondSegment", + "sequence": "AAAAAAAAAAAAAAAA" + } + ], + "genes": [ + { + "name": "someLongGene", + "sequence": "AAAAAAAAAAAAAAAAAAAAAAAAA" + }, + { + "name": "someShortGene", + "sequence": "MADS" + } + ] +} diff --git a/testBaseData/samFiles/unaligned_main.sam b/testBaseData/samFiles/unaligned_main.sam new file mode 100644 index 000000000..fe158d060 --- /dev/null +++ b/testBaseData/samFiles/unaligned_main.sam @@ -0,0 +1,2 @@ +1.1 99 NC_045512.2 44 60 5S246M = 201 404 CACA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:1 MD:Z:197C48 MC:Z:247M AS:i:241 XS:i:0 +1.3 99 NC_045512.2 45 60 47S204M = 209 400 CACA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC5CCCCCCCCCCCCCCCCC*CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC NM:i:1 MD:Z:196C7 MC:Z:236M AS:i:199 XS:i:0 SA:Z:NC_045512.2,7769,+,30M221S,60,0; diff --git a/testBaseData/samFiles/unaligned_secondSegment.fasta b/testBaseData/samFiles/unaligned_secondSegment.fasta new file mode 100644 index 000000000..e69de29bb