Skip to content

Commit

Permalink
feat: hide intermediate results of the preprocessing - don't put it i…
Browse files Browse the repository at this point in the history
…n the output
  • Loading branch information
fengelniederhammer committed Aug 9, 2023
1 parent ee9f20e commit 44327b0
Show file tree
Hide file tree
Showing 11 changed files with 67 additions and 41 deletions.
8 changes: 5 additions & 3 deletions docker-compose-for-tests-preprocessing.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@ services:
silo:
image: ${SILO_IMAGE}
volumes:
- ./testBaseData:/data
- ./testBaseData/exampleDataset:/preprocessing/input
- ./testBaseData/output:/preprocessing/output
- ./testBaseData/exampleDataset/preprocessing_config.yaml:/app/preprocessing_config.yaml
- ./testBaseData/exampleDataset/test_database_config.yaml:/app/database_config.yaml
- ./logs:/app/logs
command:
- "--preprocessing"
- "--preprocessingConfig=./preprocessing_config_for_ci.yaml"
- "--databaseConfig=./test_database_config.yaml"
1 change: 1 addition & 0 deletions docker_default_preprocessing_config.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
inputDirectory: "/preprocessing/input/"
intermediateResultsDirectory: "/preprocessing/temp/"
outputDirectory: "/preprocessing/output/"
18 changes: 10 additions & 8 deletions include/silo/preprocessing/preprocessing_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ struct OutputDirectory {
};
const OutputDirectory DEFAULT_OUTPUT_DIRECTORY = {"./output/"};

/// Strongly-typed wrapper for the directory that receives intermediate
/// preprocessing results, so it cannot be confused with the other
/// string-based constructor arguments of PreprocessingConfig.
struct IntermediateResultsDirectory {
   std::string directory;
};
/// Default location for intermediate results. Declared with its own wrapper
/// type (not OutputDirectory) so the constant matches the parameter type it
/// is the default for.
const IntermediateResultsDirectory DEFAULT_INTERMEDIATE_RESULTS_DIRECTORY = {"./temp/"};

/// Wrapper type holding a metadata file name as a plain string.
/// Used as a distinct parameter type of the PreprocessingConfig constructor.
struct MetadataFilename {
std::string filename;
};
Expand Down Expand Up @@ -51,25 +56,20 @@ struct SortedPartitionsFolder {
};
const SortedPartitionsFolder DEFAULT_SORTED_PARTITIONS_FOLDER = {"partitions_sorted/"};

struct SerializedStateFolder {
std::string folder;
};
const SerializedStateFolder DEFAULT_SERIALIZED_STATE_FOLDER = {"serialized_state/"};

struct ReferenceGenomeFilename {
std::string filename;
};
const ReferenceGenomeFilename DEFAULT_REFERENCE_GENOME_FILENAME = {"reference-genomes.json"};
const ReferenceGenomeFilename DEFAULT_REFERENCE_GENOME_FILENAME = {"reference_genomes.json"};

class PreprocessingConfig {
friend class fmt::formatter<silo::preprocessing::PreprocessingConfig>;

std::filesystem::path input_directory;
std::filesystem::path output_directory;
std::optional<std::filesystem::path> pango_lineage_definition_file;
std::filesystem::path metadata_file;
std::filesystem::path partition_folder;
std::filesystem::path sorted_partition_folder;
std::filesystem::path serialization_folder;
std::filesystem::path reference_genome_file;
std::string nucleotide_sequence_prefix;
std::string gene_prefix;
Expand All @@ -79,17 +79,19 @@ class PreprocessingConfig {

explicit PreprocessingConfig(
const InputDirectory& input_directory_,
const IntermediateResultsDirectory& intermediate_results_directory_,
const OutputDirectory& output_directory_,
const MetadataFilename& metadata_filename_,
const PangoLineageDefinitionFilename& pango_lineage_definition_filename_,
const PartitionsFolder& partition_folder_,
const SortedPartitionsFolder& sorted_partition_folder_,
const SerializedStateFolder& serialization_folder_,
const ReferenceGenomeFilename& reference_genome_filename_,
const NucleotideSequencePrefix& nucleotide_sequence_prefix_,
const GenePrefix& gene_prefix_
);

[[nodiscard]] std::filesystem::path getOutputDirectory() const;

[[nodiscard]] std::optional<std::filesystem::path> getPangoLineageDefinitionFilename() const;

[[nodiscard]] std::filesystem::path getReferenceGenomeFilename() const;
Expand Down
1 change: 1 addition & 0 deletions include/silo/preprocessing/preprocessing_config_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ struct PreprocessingConfig;
struct OptionalPreprocessingConfig {
std::optional<std::filesystem::path> input_directory;
std::optional<std::filesystem::path> output_directory;
std::optional<std::filesystem::path> intermediate_results_directory;
std::optional<std::filesystem::path> metadata_file;
std::optional<std::filesystem::path> pango_lineage_definition_file;
std::optional<std::filesystem::path> partition_folder;
Expand Down
12 changes: 10 additions & 2 deletions src/silo/database.test.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#include "silo/database.h"

#include "filesystem"

#include <gtest/gtest.h>

#include "silo/common/nucleotide_symbols.h"
Expand Down Expand Up @@ -100,9 +102,15 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) {
TEST(DatabaseTest, shouldSaveAndReloadDatabaseWithoutErrors) {
auto first_database = buildTestDatabase();

first_database.saveDatabaseState("output/serialized_state/");
const std::string directory = "output/test_serialized_state/";
if (std::filesystem::exists(directory)) {
std::filesystem::remove_all(directory);
}
std::filesystem::create_directories(directory);

first_database.saveDatabaseState(directory);

auto database = silo::Database::loadDatabaseState("output/serialized_state/");
auto database = silo::Database::loadDatabaseState(directory);

const auto simple_database_info = database.getDatabaseInfo();

Expand Down
27 changes: 19 additions & 8 deletions src/silo/preprocessing/preprocessing_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,12 @@ PreprocessingConfig::PreprocessingConfig() = default;

PreprocessingConfig::PreprocessingConfig(
const InputDirectory& input_directory_,
const IntermediateResultsDirectory& intermediate_results_directory_,
const OutputDirectory& output_directory_,
const MetadataFilename& metadata_filename_,
const PangoLineageDefinitionFilename& pango_lineage_definition_filename_,
const PartitionsFolder& partition_folder_,
const SortedPartitionsFolder& sorted_partition_folder_,
const SerializedStateFolder& serialization_folder_,
const ReferenceGenomeFilename& reference_genome_filename_,
const NucleotideSequencePrefix& nucleotide_sequence_prefix_,
const GenePrefix& gene_prefix_
Expand All @@ -72,18 +72,29 @@ PreprocessingConfig::PreprocessingConfig(
}
reference_genome_file = createPath(input_directory, reference_genome_filename_.filename);

const std::filesystem::path output_directory(output_directory_.directory);
if (!std::filesystem::exists(output_directory_.directory)) {
std::filesystem::create_directory(output_directory_.directory);
}
this->output_directory = output_directory_.directory;

partition_folder = createOutputPath(output_directory, partition_folder_.folder);
sorted_partition_folder = createOutputPath(output_directory, sorted_partition_folder_.folder);
serialization_folder = createOutputPath(output_directory, serialization_folder_.folder);
const std::filesystem::path intermediate_results_directory(
intermediate_results_directory_.directory
);
if (!std::filesystem::exists(intermediate_results_directory_.directory)) {
std::filesystem::create_directory(intermediate_results_directory_.directory);
}

partition_folder = createOutputPath(intermediate_results_directory, partition_folder_.folder);
sorted_partition_folder =
createOutputPath(intermediate_results_directory, sorted_partition_folder_.folder);
nucleotide_sequence_prefix = nucleotide_sequence_prefix_.prefix;
gene_prefix = gene_prefix_.prefix;
}

std::filesystem::path PreprocessingConfig::getOutputDirectory() const {
return output_directory;
}

std::optional<std::filesystem::path> PreprocessingConfig::getPangoLineageDefinitionFilename(
) const {
return pango_lineage_definition_file;
Expand Down Expand Up @@ -233,18 +244,18 @@ std::filesystem::path PreprocessingConfig::getGeneSortedPartitionFilename(
) -> decltype(ctx.out()) {
return format_to(
ctx.out(),
"{{ input directory: '{}', pango_lineage_definition_file: {}, "
"{{ input directory: '{}', pango_lineage_definition_file: {}, output_directory: '{}', "
"metadata_file: '{}', partition_folder: '{}', sorted_partition_folder: '{}', "
"serialization_folder: '{}', reference_genome_file: '{}', gene_file_prefix: '{}', "
"reference_genome_file: '{}', gene_file_prefix: '{}', "
"nucleotide_sequence_file_prefix: '{}' }}",
preprocessing_config.input_directory.string(),
preprocessing_config.output_directory.string(),
preprocessing_config.pango_lineage_definition_file.has_value()
? "'" + preprocessing_config.pango_lineage_definition_file->string() + "'"
: "none",
preprocessing_config.metadata_file.string(),
preprocessing_config.partition_folder.string(),
preprocessing_config.sorted_partition_folder.string(),
preprocessing_config.serialization_folder.string(),
preprocessing_config.reference_genome_file.string(),
preprocessing_config.nucleotide_sequence_prefix,
preprocessing_config.gene_prefix
Expand Down
9 changes: 5 additions & 4 deletions src/silo/preprocessing/preprocessing_config_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ using silo::preprocessing::OutputDirectory;
using silo::preprocessing::PangoLineageDefinitionFilename;
using silo::preprocessing::PartitionsFolder;
using silo::preprocessing::ReferenceGenomeFilename;
using silo::preprocessing::SerializedStateFolder;
using silo::preprocessing::SortedPartitionsFolder;

namespace YAML {
Expand All @@ -34,6 +33,7 @@ struct convert<OptionalPreprocessingConfig> {
config = OptionalPreprocessingConfig(
extractStringIfPresent(node, "inputDirectory"),
extractStringIfPresent(node, "outputDirectory"),
extractStringIfPresent(node, "intermediateResultsDirectory"),
extractStringIfPresent(node, "metadataFilename"),
extractStringIfPresent(node, "pangoLineageDefinitionFilename"),
extractStringIfPresent(node, "partitionsFolder"),
Expand Down Expand Up @@ -65,6 +65,10 @@ PreprocessingConfig OptionalPreprocessingConfig::mergeValuesFromOrDefault(
InputDirectory{input_directory.value_or(
other.input_directory.value_or(silo::preprocessing::DEFAULT_INPUT_DIRECTORY.directory)
)},
IntermediateResultsDirectory{
intermediate_results_directory.value_or(other.intermediate_results_directory.value_or(
silo::preprocessing::DEFAULT_INTERMEDIATE_RESULTS_DIRECTORY.directory
))},
OutputDirectory{output_directory.value_or(
other.output_directory.value_or(silo::preprocessing::DEFAULT_OUTPUT_DIRECTORY.directory)
)},
Expand All @@ -81,9 +85,6 @@ PreprocessingConfig OptionalPreprocessingConfig::mergeValuesFromOrDefault(
sorted_partition_folder.value_or(other.sorted_partition_folder.value_or(
silo::preprocessing::DEFAULT_SORTED_PARTITIONS_FOLDER.folder
))},
SerializedStateFolder{serialization_folder.value_or(other.serialization_folder.value_or(
silo::preprocessing::DEFAULT_SERIALIZED_STATE_FOLDER.folder
))},
ReferenceGenomeFilename{reference_genome_file.value_or(other.reference_genome_file.value_or(
silo::preprocessing::DEFAULT_REFERENCE_GENOME_FILENAME.filename
))},
Expand Down
24 changes: 13 additions & 11 deletions src/silo/preprocessing/preprocessing_config_reader.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,26 @@ TEST(PreprocessingConfigReader, shouldReadConfigWithCorrectParametersAndDefaults
);

const std::string input_directory = "./testBaseData/exampleDataset/";
const std::string output_directory = "./output/";
const std::string intermediate_directory = "./temp/";
ASSERT_EQ(config.getMetadataInputFilename(), input_directory + "small_metadata_set.tsv");
ASSERT_EQ(
config.getPangoLineageDefinitionFilename(), input_directory + "pangolineage_alias.json"
);
ASSERT_EQ(
config.getNucPartitionFilename("dummy", 0, 0),
output_directory + "partitions/nuc_dummy/P0_C0.zstdfasta"
intermediate_directory + "partitions/nuc_dummy/P0_C0.zstdfasta"
);
ASSERT_EQ(
config.getGenePartitionFilename("dummy2", 0, 0),
output_directory + "partitions/gene_dummy2/P0_C0.zstdfasta"
intermediate_directory + "partitions/gene_dummy2/P0_C0.zstdfasta"
);
ASSERT_EQ(
config.getNucSortedPartitionFilename("dummy", 2, 1),
output_directory + "partitions_sorted/nuc_dummy/P2_C1.zstdfasta"
intermediate_directory + "partitions_sorted/nuc_dummy/P2_C1.zstdfasta"
);
ASSERT_EQ(
config.getGeneSortedPartitionFilename("dummy", 2, 1),
output_directory + "partitions_sorted/gene_dummy/P2_C1.zstdfasta"
intermediate_directory + "partitions_sorted/gene_dummy/P2_C1.zstdfasta"
);
}

Expand All @@ -59,7 +59,7 @@ TEST(PreprocessingConfigReader, shouldReadConfigWithOverriddenDefaults) {
);

const std::string input_directory = "./testBaseData/exampleDataset/";
const std::string output_directory = "./output/";
const std::string intermediate_directory = "./output/overriddenTemp/";
ASSERT_EQ(config.getMetadataInputFilename(), input_directory + "small_metadata_set.tsv");
ASSERT_EQ(
config.getPangoLineageDefinitionFilename(), input_directory + "pangolineage_alias.json"
Expand All @@ -68,12 +68,13 @@ TEST(PreprocessingConfigReader, shouldReadConfigWithOverriddenDefaults) {
ASSERT_EQ(config.getNucFilename("aligned"), input_directory + "aligned.fasta");
ASSERT_EQ(
config.getNucPartitionFilename("aligned", 0, 1),
output_directory + "folder1/aligned/P0_C1.zstdfasta"
intermediate_directory + "folder1/aligned/P0_C1.zstdfasta"
);
ASSERT_EQ(
config.getNucSortedPartitionFilename("aligned", 2, 3),
output_directory + "folder2/aligned/P2_C3.zstdfasta"
intermediate_directory + "folder2/aligned/P2_C3.zstdfasta"
);
ASSERT_EQ(config.getOutputDirectory(), "./output/custom/");
}

TEST(OptionalPreprocessingConfig, givenLeftHandSideHasValueThenMergeTakesLeftHandSideValue) {
Expand All @@ -87,7 +88,7 @@ TEST(OptionalPreprocessingConfig, givenLeftHandSideHasValueThenMergeTakesLeftHan

ASSERT_EQ(
result.getGeneFilename("dummy"),
std::filesystem::path("./testBaseData/leftTestPrefix_dummy.fasta")
std::filesystem::path("./testBaseData/exampleDataset/leftTestPrefix_dummy.fasta")
);
}

Expand All @@ -101,7 +102,7 @@ TEST(OptionalPreprocessingConfig, givenLeftHandSideHasNotValueThenMergeTakesRigh

ASSERT_EQ(
result.getGeneFilename("dummy"),
std::filesystem::path("./testBaseData/rightTestPrefix_dummy.fasta")
std::filesystem::path("./testBaseData/exampleDataset/rightTestPrefix_dummy.fasta")
);
}

Expand All @@ -113,6 +114,7 @@ TEST(OptionalPreprocessingConfig, givenNeitherSideHasValueThenMergeTakesDefaultV
const auto result = left.mergeValuesFromOrDefault(right);

ASSERT_EQ(
result.getGeneFilename("dummy"), std::filesystem::path("./testBaseData/gene_dummy.fasta")
result.getGeneFilename("dummy"),
std::filesystem::path("./testBaseData/exampleDataset/gene_dummy.fasta")
);
}
2 changes: 1 addition & 1 deletion src/silo_api/api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ class SiloServer : public Poco::Util::ServerApplication {
auto database_preprocessing =
silo::Database::preprocessing(preprocessing_config, database_config);

database_preprocessing.saveDatabaseState(preprocessing_config.getSerializedStateFolder());
database_preprocessing.saveDatabaseState(preprocessing_config.getOutputDirectory());

return Application::EXIT_OK;
};
Expand Down
2 changes: 0 additions & 2 deletions testBaseData/exampleDataset/preprocessing_config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
inputDirectory: "./"
outputDirectory: "./output/"
metadataFilename: "small_metadata_set.tsv"
pangoLineageDefinitionFilename: "pangolineage_alias.json"
referenceGenomeFilename: "reference_genomes.json"
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
inputDirectory: "./testBaseData/exampleDataset/"
outputDirectory: "./output/"
outputDirectory: "./output/custom/"
intermediateResultsDirectory: "./output/overriddenTemp/"
metadataFilename: "small_metadata_set.tsv"
pangoLineageDefinitionFilename: "pangolineage_alias.json"
referenceGenomeFilename: "reference_genomes.json"
partitionsFolder: "folder1/"
sortedPartitionsFolder: "folder2/"
serializedStateFolder: "folder3/"
genePrefix: "aaSeq_"
nucleotideSequencePrefix: ""

0 comments on commit 44327b0

Please sign in to comment.