From 8fa8efb5f7634e9eb91ad71d0d5d3af47c0fcf30 Mon Sep 17 00:00:00 2001 From: Alexander Taepper Date: Wed, 12 Jul 2023 15:05:54 +0200 Subject: [PATCH] fix: add sleep statement before row call. This is necessary because row[column_name] does not free the memory fast enough. --- src/silo/prepare_dataset.cpp | 34 ----------------------------- src/silo/preprocessing/metadata.cpp | 24 ++++++++++++++------ 2 files changed, 17 insertions(+), 41 deletions(-) diff --git a/src/silo/prepare_dataset.cpp b/src/silo/prepare_dataset.cpp index d3c8ba6c9..e4e6fffa3 100644 --- a/src/silo/prepare_dataset.cpp +++ b/src/silo/prepare_dataset.cpp @@ -59,40 +59,6 @@ const std::string TSV_EXTENSION(".tsv"); } } -[[maybe_unused]] void silo::pruneSequences( - silo::preprocessing::MetadataReader& metadata_reader, - silo::FastaReader& sequences_in, - std::ostream& sequences_out, - const silo::config::DatabaseConfig& database_config -) { - SPDLOG_INFO("Pruning sequences"); - - const auto primary_key_vector = metadata_reader.getColumn(database_config.schema.primary_key); - const std::unordered_set primary_keys( - primary_key_vector.begin(), primary_key_vector.end() - ); - - SPDLOG_INFO("Finished reading metadata, found {} rows", primary_keys.size()); - - uint32_t found_sequences_count = 0; - { - std::optional key; - std::string genome; - while (true) { - key = sequences_in.next(genome); - if (!key.has_value()) { - break; - } - if (primary_keys.contains(*key)) { - found_sequences_count++; - sequences_out << *key << "\n" << genome << "\n"; - sequences_out << *key << "\n" << genome << "\n"; - } - } - } - SPDLOG_INFO("Finished reading sequences, found {} sequences", found_sequences_count); -} - std::unordered_map> getMetadataWritersForChunks( const std::filesystem::path& output_folder, diff --git a/src/silo/preprocessing/metadata.cpp b/src/silo/preprocessing/metadata.cpp index 2a4db0a80..f3d053a8f 100644 --- a/src/silo/preprocessing/metadata.cpp +++ b/src/silo/preprocessing/metadata.cpp @@ -11,14
+11,23 @@ namespace silo::preprocessing { -MetadataReader::MetadataReader(const std::filesystem::path& metadata_path) try - : reader(metadata_path.string()) { -} catch (const std::exception& exception) { - const std::string message = - "Failed to read metadata file '" + metadata_path.string() + "': " + exception.what(); - throw PreprocessingException(message); +csv::CSVReader buildReader(const std::filesystem::path& metadata_path) { + try { + csv::CSVFormat format; + format.delimiter('\t'); + format.variable_columns(csv::VariableColumnPolicy::THROW); + format.header_row(0); + return {metadata_path.string(), format}; + } catch (const std::exception& exception) { + const std::string message = + "Failed to read metadata file '" + metadata_path.string() + "': " + exception.what(); + throw PreprocessingException(message); + } } +MetadataReader::MetadataReader(const std::filesystem::path& metadata_path) + : reader(buildReader(metadata_path)) {} + std::vector MetadataReader::getColumn(const std::string& column_name) { if (reader.index_of(column_name) == csv::CSV_NOT_FOUND) { const std::string message = "Failed to read metadata column '" + column_name + "'"; @@ -26,7 +35,8 @@ std::vector MetadataReader::getColumn(const std::string& column_nam } std::vector column; for (const auto& row : reader) { - column.push_back(row[column_name].get()); + std::this_thread::sleep_for(std::chrono::nanoseconds(1)); + column.emplace_back(row[column_name].get()); } return column; }