Skip to content

Commit

Permalink
Merge pull request #170 from GenSpectrum/hotfixAlexJonas
Browse files Browse the repository at this point in the history
fix: metadata reader for large files
  • Loading branch information
fengelniederhammer authored Jul 17, 2023
2 parents 27b992a + 8fa8efb commit 04a3fe4
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 41 deletions.
34 changes: 0 additions & 34 deletions src/silo/prepare_dataset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,40 +59,6 @@ const std::string TSV_EXTENSION(".tsv");
}
}

/// Copies the sequences whose key occurs in the metadata primary-key column
/// from `sequences_in` to `sequences_out`.
///
/// Each kept sequence is written exactly once, as its key on one line followed
/// by its genome on the next line.
///
/// @param metadata_reader  provides the primary-key column via getColumn()
/// @param sequences_in     sequence source; next() returns the key of the next
///                         entry, or an empty optional when there is none
/// @param sequences_out    stream that receives the kept sequences
/// @param database_config  supplies schema.primary_key, the name of the
///                         metadata column holding the keys
[[maybe_unused]] void silo::pruneSequences(
   silo::preprocessing::MetadataReader& metadata_reader,
   silo::FastaReader& sequences_in,
   std::ostream& sequences_out,
   const silo::config::DatabaseConfig& database_config
) {
   SPDLOG_INFO("Pruning sequences");

   const auto primary_key_vector = metadata_reader.getColumn(database_config.schema.primary_key);
   const std::unordered_set<std::string> primary_keys(
      primary_key_vector.begin(), primary_key_vector.end()
   );

   // The set drops repeated keys, so the logged number counts distinct keys.
   SPDLOG_INFO("Finished reading metadata, found {} rows", primary_keys.size());

   uint32_t found_sequences_count = 0;
   std::optional<std::string> key;
   // Presumably next() stores the sequence belonging to the returned key here.
   std::string genome;
   while ((key = sequences_in.next(genome)).has_value()) {
      if (!primary_keys.contains(*key)) {
         continue;
      }
      ++found_sequences_count;
      // One write per counted sequence: the output stays in step with the counter.
      sequences_out << *key << "\n" << genome << "\n";
   }
   SPDLOG_INFO("Finished reading sequences, found {} sequences", found_sequences_count);
}

std::unordered_map<std::string, std::unique_ptr<silo::preprocessing::MetadataWriter>>
getMetadataWritersForChunks(
const std::filesystem::path& output_folder,
Expand Down
24 changes: 17 additions & 7 deletions src/silo/preprocessing/metadata.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,22 +11,32 @@

namespace silo::preprocessing {

MetadataReader::MetadataReader(const std::filesystem::path& metadata_path) try
: reader(metadata_path.string()) {
} catch (const std::exception& exception) {
const std::string message =
"Failed to read metadata file '" + metadata_path.string() + "': " + exception.what();
throw PreprocessingException(message);
/// Creates the CSV reader used for a metadata file.
///
/// The reader is configured with a tab delimiter, VariableColumnPolicy::THROW
/// and header row 0.
///
/// @param metadata_path  location of the metadata file
/// @return a reader constructed from the path and the format above
/// @throws PreprocessingException wrapping any std::exception raised while
///         configuring the format or constructing the reader
csv::CSVReader buildReader(const std::filesystem::path& metadata_path) {
   try {
      csv::CSVFormat tsv_format;
      tsv_format.delimiter('\t');
      tsv_format.variable_columns(csv::VariableColumnPolicy::THROW);
      tsv_format.header_row(0);
      return {metadata_path.string(), tsv_format};
   } catch (const std::exception& error) {
      // Re-throw as the project's exception type, naming the offending file.
      throw PreprocessingException(
         "Failed to read metadata file '" + metadata_path.string() + "': " + error.what()
      );
   }
}

/// Constructs the reader for the metadata file at `metadata_path`.
///
/// All work is delegated to buildReader(); any exception it throws leaves
/// this constructor unchanged.
MetadataReader::MetadataReader(const std::filesystem::path& metadata_path)
    : reader(buildReader(metadata_path)) {
}

/// Collects the values of one metadata column.
///
/// @param column_name  name of the column to extract
/// @return one string per row yielded by the reader, in iteration order
/// @throws PreprocessingException if the reader reports no column with that name
std::vector<std::string> MetadataReader::getColumn(const std::string& column_name) {
   if (reader.index_of(column_name) == csv::CSV_NOT_FOUND) {
      const std::string message = "Failed to read metadata column '" + column_name + "'";
      throw PreprocessingException(message);
   }
   std::vector<std::string> column;
   for (const auto& row : reader) {
      // NOTE(review): this 1 ns sleep came in with the "metadata reader for
      // large files" hotfix and looks like a deliberate workaround — confirm
      // the underlying cause before removing it.
      std::this_thread::sleep_for(std::chrono::nanoseconds(1));
      // Append each row's value exactly once.
      column.emplace_back(row[column_name].get<std::string>());
   }
   return column;
}
Expand Down

0 comments on commit 04a3fe4

Please sign in to comment.