Skip to content

Commit

Permalink
feat: support SAM files as sequence input and allow partial sequence …
Browse files Browse the repository at this point in the history
…input with an offset
  • Loading branch information
David Gichev authored and Taepper committed Aug 5, 2024
1 parent 427d920 commit 5be9a9f
Show file tree
Hide file tree
Showing 66 changed files with 880 additions and 1,002 deletions.
25 changes: 0 additions & 25 deletions include/silo/common/fasta_reader.h

This file was deleted.

5 changes: 4 additions & 1 deletion include/silo/common/input_stream_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,20 @@
#include <filesystem>
#include <fstream>
#include <memory>
#include <sstream>

#include <boost/iostreams/filtering_stream.hpp>

namespace silo {
class InputStreamWrapper {
private:
std::ifstream file;
std::ifstream file_stream;
std::istringstream string_stream;
std::unique_ptr<boost::iostreams::filtering_istream> input_stream;

public:
explicit InputStreamWrapper(const std::filesystem::path& filename);
explicit InputStreamWrapper(const std::string& content);

[[nodiscard]] std::istream& getInputStream() const;
};
Expand Down
16 changes: 11 additions & 5 deletions include/silo/preprocessing/preprocessing_database.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

namespace silo {

class ZstdFastaTable;
class ZstdTable;
class ReferenceGenomes;
class CompressSequence;

Expand Down Expand Up @@ -42,16 +42,22 @@ class PreprocessingDatabase {

std::unique_ptr<duckdb::MaterializedQueryResult> query(std::string sql_query);

ZstdFastaTable generateSequenceTableFromFasta(
ZstdTable generateSequenceTableViaFile(
const std::string& table_name,
const std::string& reference_sequence,
const std::string& filename
const std::filesystem::path& file_path
);

ZstdFastaTable generateSequenceTableFromZstdFasta(
ZstdTable generateSequenceTableFromFasta(
const std::string& table_name,
const std::string& reference_sequence,
const std::string& filename
const std::filesystem::path& file_name
);

ZstdTable generateSequenceTableFromSAM(
const std::string& table_name,
const std::string& reference_sequence,
const std::filesystem::path& file_name
);
};

Expand Down
15 changes: 15 additions & 0 deletions include/silo/preprocessing/preprocessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,14 @@

#include <optional>

#include "silo/common/table_reader.h"
#include "silo/config/database_config.h"
#include "silo/config/preprocessing_config.h"
#include "silo/preprocessing/preprocessing_database.h"
#include "silo/storage/pango_lineage_alias.h"
#include "silo/storage/reference_genomes.h"
#include "silo/storage/sequence_store.h"
#include "silo/zstd/zstd_decompressor.h"

namespace silo {
class Database;
Expand Down Expand Up @@ -91,6 +94,18 @@ class Preprocessor {
const std::string& order_by_clause
);

template <typename SymbolType>
ColumnFunction createRawReadLambda(
ZstdDecompressor& decompressor,
silo::SequenceStorePartition<SymbolType>& sequence_store
);

template <typename SymbolType>
ColumnFunction createInsertionLambda(
const std::string& sequence_name,
silo::SequenceStorePartition<SymbolType>& sequence_store
);

template <typename SymbolType>
void buildSequenceStore(
Database& database,
Expand Down
2 changes: 1 addition & 1 deletion include/silo/preprocessing/sql_function.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
#include <duckdb.hpp>

#include "silo/storage/pango_lineage_alias.h"
#include "silo/zstdfasta/zstd_compressor.h"
#include "silo/zstd/zstd_compressor.h"

namespace silo {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
#include <stdexcept>
#include <string>

namespace silo {
namespace silo::sequence_file_reader {

class FastaFormatException : public std::runtime_error {
public:
explicit FastaFormatException(const std::string& error_message);
explicit FastaFormatException(const std::string& error_message)
: std::runtime_error(error_message.c_str()){};
};

} // namespace silo
} // namespace silo::sequence_file_reader
17 changes: 17 additions & 0 deletions include/silo/sequence_file_reader/fasta_reader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#pragma once

#include <filesystem>
#include <optional>
#include <string>

#include "silo/sequence_file_reader/sequence_file_reader.h"

namespace silo::sequence_file_reader {

class FastaReader : public SequenceFileReader {
public:
explicit FastaReader(const std::filesystem::path& in_file_name)
: SequenceFileReader(in_file_name) {}
std::optional<ReadSequence> nextEntry() override;
};
} // namespace silo::sequence_file_reader
14 changes: 14 additions & 0 deletions include/silo/sequence_file_reader/sam_format_exception.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#pragma once

#include <stdexcept>
#include <string>

namespace silo::sequence_file_reader {

class SamFormatException : public std::runtime_error {
public:
explicit SamFormatException(const std::string& error_message)
: std::runtime_error(error_message.c_str()){};
};

} // namespace silo::sequence_file_reader
19 changes: 19 additions & 0 deletions include/silo/sequence_file_reader/sam_reader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#pragma once

#include <filesystem>
#include <optional>
#include <string>

#include "silo/sequence_file_reader/sequence_file_reader.h"

namespace silo::sequence_file_reader {

class SamReader : public SequenceFileReader {
public:
explicit SamReader(const std::filesystem::path& in_file_name)
: SequenceFileReader(in_file_name) {}
explicit SamReader(const std::string& file_content)
: SequenceFileReader(file_content) {}
std::optional<ReadSequence> nextEntry() override;
};
} // namespace silo::sequence_file_reader
29 changes: 29 additions & 0 deletions include/silo/sequence_file_reader/sequence_file_reader.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once

#include <optional>
#include <string>

#include "silo/common/input_stream_wrapper.h"

namespace silo::sequence_file_reader {
class SequenceFileReader {
protected:
explicit SequenceFileReader(const std::filesystem::path& in_file_name)
: in_file(in_file_name){};
explicit SequenceFileReader(const std::string& file_content)
: in_file(file_content){};

silo::InputStreamWrapper in_file;

public:
struct ReadSequence {
std::string key;
uint32_t offset;
std::string sequence;
};

virtual std::optional<ReadSequence> nextEntry() = 0;

virtual ~SequenceFileReader(){};
};
} // namespace silo::sequence_file_reader
30 changes: 24 additions & 6 deletions include/silo/storage/sequence_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
#include <vector>

#include <fmt/format.h>
#include <duckdb/main/connection.hpp>
#include <roaring/roaring.hh>

#include "silo/common/aa_symbols.h"
#include "silo/common/format_number.h"
#include "silo/common/nucleotide_symbols.h"
#include "silo/common/symbol_map.h"
#include "silo/common/table_reader.h"
#include "silo/storage/insertion_index.h"
#include "silo/storage/position.h"

Expand All @@ -25,14 +27,27 @@ class access;
namespace silo {
template <typename SymbolType>
class Position;
class ZstdFastaTableReader;
class ZstdTableReader;

struct SequenceStoreInfo {
uint32_t sequence_count;
uint64_t size;
size_t n_bitmaps_size;
};

struct ReadSequence {
bool is_valid = false;
std::string sequence = "";
uint32_t offset;

ReadSequence(std::string_view _sequence, uint32_t _offset = 0)
: sequence(std::move(_sequence)),
offset(_offset),
is_valid(true) {}

ReadSequence() {}
};

template <typename SymbolType>
class SequenceStorePartition {
friend class boost::serialization::access;
Expand Down Expand Up @@ -60,18 +75,23 @@ class SequenceStorePartition {
uint32_t sequence_count = 0;

private:
void fillIndexes(const std::vector<std::optional<std::string>>& genomes);
static constexpr size_t BUFFER_SIZE = 1024;
std::vector<ReadSequence> lazy_buffer;

void fillIndexes(const std::vector<ReadSequence>& reads);

void addSymbolsToPositions(
size_t position_idx,
SymbolMap<SymbolType, std::vector<uint32_t>>& ids_per_symbol_for_current_position,
size_t number_of_sequences
);

void fillNBitmaps(const std::vector<std::optional<std::string>>& genomes);
void fillNBitmaps(const std::vector<ReadSequence>& reads);

void optimizeBitmaps();

void flushBuffer(const std::vector<ReadSequence>& reads);

public:
explicit SequenceStorePartition(
const std::vector<typename SymbolType::Symbol>& reference_sequence
Expand All @@ -86,12 +106,10 @@ class SequenceStorePartition {

[[nodiscard]] SequenceStoreInfo getInfo() const;

size_t fill(silo::ZstdFastaTableReader& input);
ReadSequence& appendNewSequenceRead();

void insertInsertion(size_t row_id, const std::string& insertion_and_position);

void interpret(const std::vector<std::optional<std::string>>& genomes);

void finalize();
};

Expand Down
2 changes: 1 addition & 1 deletion include/silo/storage/unaligned_sequence_store.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class access;
} // namespace boost::serialization

namespace silo {
class ZstdFastaTableReader;
class ZstdTableReader;

/// Holds information where to read unaligned sequences for a
/// segment (= the sequence of a particular name) in one partition.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@

#include <zstd.h>

#include "silo/zstdfasta/zstd_context.h"
#include "silo/zstdfasta/zstd_dictionary.h"
#include "silo/zstd/zstd_context.h"
#include "silo/zstd/zstd_dictionary.h"

namespace silo {

Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,13 @@ namespace silo {
class ZstdDecompressor {
ZstdDDictionary zstd_dictionary;
ZstdDContext zstd_context;
std::string buffer;

public:
explicit ZstdDecompressor(std::string_view dictionary_string);

std::string_view decompress(const std::string& input);
void decompress(const std::string& input, std::string& buffer);

std::string_view decompress(const char* input_data, size_t input_length);
void decompress(const char* input_data, size_t input_length, std::string& buffer);
};

} // namespace silo
File renamed without changes.
30 changes: 30 additions & 0 deletions include/silo/zstd/zstd_table.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#pragma once

#include <string>

#include "silo/sequence_file_reader/sequence_file_reader.h"

namespace duckdb {
struct Connection;
}

namespace silo {

class ZstdTable {
duckdb::Connection& connection;
std::string table_name;

ZstdTable(duckdb::Connection& connection, std::string table_name)
: connection(connection),
table_name(std::move(table_name)){};

public:
static ZstdTable generate(
duckdb::Connection& connection,
const std::string& table_name,
sequence_file_reader::SequenceFileReader& file_reader,
std::string_view reference_sequence
);
};

} // namespace silo
Loading

0 comments on commit 5be9a9f

Please sign in to comment.