diff --git a/docker-compose-for-tests-preprocessing.yml b/docker-compose-for-tests-preprocessing.yml index ec3c3aa28..77dbe8f71 100644 --- a/docker-compose-for-tests-preprocessing.yml +++ b/docker-compose-for-tests-preprocessing.yml @@ -3,8 +3,10 @@ services: silo: image: ${SILO_IMAGE} volumes: - - ./testBaseData:/data + - ./testBaseData/exampleDataset:/preprocessing/input + - ./testBaseData/output:/preprocessing/output + - ./testBaseData/exampleDataset/preprocessing_config.yaml:/app/preprocessing_config.yaml + - ./testBaseData/exampleDataset/test_database_config.yaml:/app/database_config.yaml + - ./logs:/app/logs command: - "--preprocessing" - - "--preprocessingConfig=./preprocessing_config_for_ci.yaml" - - "--databaseConfig=./test_database_config.yaml" \ No newline at end of file diff --git a/docker_default_preprocessing_config.yaml b/docker_default_preprocessing_config.yaml index dfa25b88b..fb40f9d2f 100644 --- a/docker_default_preprocessing_config.yaml +++ b/docker_default_preprocessing_config.yaml @@ -1,2 +1,3 @@ inputDirectory: "/preprocessing/input/" +intermediateResultsDirectory: "/preprocessing/temp/" outputDirectory: "/preprocessing/output/" diff --git a/include/silo/preprocessing/preprocessing_config.h b/include/silo/preprocessing/preprocessing_config.h index 42fa1f5fd..8025acac9 100644 --- a/include/silo/preprocessing/preprocessing_config.h +++ b/include/silo/preprocessing/preprocessing_config.h @@ -22,6 +22,11 @@ struct OutputDirectory { }; const OutputDirectory DEFAULT_OUTPUT_DIRECTORY = {"./output/"}; +struct IntermediateResultsDirectory { + std::string directory; +}; +const IntermediateResultsDirectory DEFAULT_INTERMEDIATE_RESULTS_DIRECTORY = {"./temp/"}; + struct MetadataFilename { std::string filename; }; @@ -51,25 +56,20 @@ struct SortedPartitionsFolder { }; const SortedPartitionsFolder DEFAULT_SORTED_PARTITIONS_FOLDER = {"partitions_sorted/"}; -struct SerializedStateFolder { - std::string folder; -}; -const SerializedStateFolder 
DEFAULT_SERIALIZED_STATE_FOLDER = {"serialized_state/"}; - struct ReferenceGenomeFilename { std::string filename; }; -const ReferenceGenomeFilename DEFAULT_REFERENCE_GENOME_FILENAME = {"reference-genomes.json"}; +const ReferenceGenomeFilename DEFAULT_REFERENCE_GENOME_FILENAME = {"reference_genomes.json"}; class PreprocessingConfig { friend class fmt::formatter; std::filesystem::path input_directory; + std::filesystem::path output_directory; std::optional pango_lineage_definition_file; std::filesystem::path metadata_file; std::filesystem::path partition_folder; std::filesystem::path sorted_partition_folder; - std::filesystem::path serialization_folder; std::filesystem::path reference_genome_file; std::string nucleotide_sequence_prefix; std::string gene_prefix; @@ -79,17 +79,19 @@ class PreprocessingConfig { explicit PreprocessingConfig( const InputDirectory& input_directory_, + const IntermediateResultsDirectory& intermediate_results_directory_, const OutputDirectory& output_directory_, const MetadataFilename& metadata_filename_, const PangoLineageDefinitionFilename& pango_lineage_definition_filename_, const PartitionsFolder& partition_folder_, const SortedPartitionsFolder& sorted_partition_folder_, - const SerializedStateFolder& serialization_folder_, const ReferenceGenomeFilename& reference_genome_filename_, const NucleotideSequencePrefix& nucleotide_sequence_prefix_, const GenePrefix& gene_prefix_ ); + [[nodiscard]] std::filesystem::path getOutputDirectory() const; + [[nodiscard]] std::optional getPangoLineageDefinitionFilename() const; [[nodiscard]] std::filesystem::path getReferenceGenomeFilename() const; diff --git a/include/silo/preprocessing/preprocessing_config_reader.h b/include/silo/preprocessing/preprocessing_config_reader.h index 59d5586d8..7cabd5eb2 100644 --- a/include/silo/preprocessing/preprocessing_config_reader.h +++ b/include/silo/preprocessing/preprocessing_config_reader.h @@ -10,6 +10,7 @@ struct PreprocessingConfig; struct 
OptionalPreprocessingConfig { std::optional input_directory; std::optional output_directory; + std::optional intermediate_results_directory; std::optional metadata_file; std::optional pango_lineage_definition_file; std::optional partition_folder; diff --git a/src/silo/database.test.cpp b/src/silo/database.test.cpp index 09f0c4c31..bc619c424 100644 --- a/src/silo/database.test.cpp +++ b/src/silo/database.test.cpp @@ -1,5 +1,7 @@ #include "silo/database.h" +#include <filesystem> + #include #include "silo/common/nucleotide_symbols.h" @@ -100,9 +102,15 @@ TEST(DatabaseTest, shouldReturnCorrectDatabaseInfo) { TEST(DatabaseTest, shouldSaveAndReloadDatabaseWithoutErrors) { auto first_database = buildTestDatabase(); - first_database.saveDatabaseState("output/serialized_state/"); + const std::string directory = "output/test_serialized_state/"; + if (std::filesystem::exists(directory)) { + std::filesystem::remove_all(directory); + } + std::filesystem::create_directories(directory); + + first_database.saveDatabaseState(directory); - auto database = silo::Database::loadDatabaseState("output/serialized_state/"); + auto database = silo::Database::loadDatabaseState(directory); const auto simple_database_info = database.getDatabaseInfo(); diff --git a/src/silo/preprocessing/preprocessing_config.cpp b/src/silo/preprocessing/preprocessing_config.cpp index 9629c9592..da96ce0c0 100644 --- a/src/silo/preprocessing/preprocessing_config.cpp +++ b/src/silo/preprocessing/preprocessing_config.cpp @@ -48,12 +48,12 @@ PreprocessingConfig::PreprocessingConfig() = default; PreprocessingConfig::PreprocessingConfig( const InputDirectory& input_directory_, + const IntermediateResultsDirectory& intermediate_results_directory_, const OutputDirectory& output_directory_, const MetadataFilename& metadata_filename_, const PangoLineageDefinitionFilename& pango_lineage_definition_filename_, const PartitionsFolder& partition_folder_, const SortedPartitionsFolder& sorted_partition_folder_, - const 
SerializedStateFolder& serialization_folder_, const ReferenceGenomeFilename& reference_genome_filename_, const NucleotideSequencePrefix& nucleotide_sequence_prefix_, const GenePrefix& gene_prefix_ @@ -72,18 +72,29 @@ PreprocessingConfig::PreprocessingConfig( } reference_genome_file = createPath(input_directory, reference_genome_filename_.filename); - const std::filesystem::path output_directory(output_directory_.directory); if (!std::filesystem::exists(output_directory_.directory)) { std::filesystem::create_directory(output_directory_.directory); } + this->output_directory = output_directory_.directory; - partition_folder = createOutputPath(output_directory, partition_folder_.folder); - sorted_partition_folder = createOutputPath(output_directory, sorted_partition_folder_.folder); - serialization_folder = createOutputPath(output_directory, serialization_folder_.folder); + const std::filesystem::path intermediate_results_directory( + intermediate_results_directory_.directory + ); + if (!std::filesystem::exists(intermediate_results_directory_.directory)) { + std::filesystem::create_directories(intermediate_results_directory_.directory); + } + + partition_folder = createOutputPath(intermediate_results_directory, partition_folder_.folder); + sorted_partition_folder = + createOutputPath(intermediate_results_directory, sorted_partition_folder_.folder); nucleotide_sequence_prefix = nucleotide_sequence_prefix_.prefix; gene_prefix = gene_prefix_.prefix; } +std::filesystem::path PreprocessingConfig::getOutputDirectory() const { + return output_directory; +} + std::optional PreprocessingConfig::getPangoLineageDefinitionFilename( ) const { return pango_lineage_definition_file; @@ -233,18 +244,18 @@ std::filesystem::path PreprocessingConfig::getGeneSortedPartitionFilename( ) -> decltype(ctx.out()) { return format_to( ctx.out(), - "{{ input directory: '{}', pango_lineage_definition_file: {}, " + "{{ input directory: '{}', output_directory: '{}', pango_lineage_definition_file: {}, " 
"metadata_file: '{}', partition_folder: '{}', sorted_partition_folder: '{}', " - "serialization_folder: '{}', reference_genome_file: '{}', gene_file_prefix: '{}', " + "reference_genome_file: '{}', gene_file_prefix: '{}', " "nucleotide_sequence_file_prefix: '{}' }}", preprocessing_config.input_directory.string(), + preprocessing_config.output_directory.string(), preprocessing_config.pango_lineage_definition_file.has_value() ? "'" + preprocessing_config.pango_lineage_definition_file->string() + "'" : "none", preprocessing_config.metadata_file.string(), preprocessing_config.partition_folder.string(), preprocessing_config.sorted_partition_folder.string(), - preprocessing_config.serialization_folder.string(), preprocessing_config.reference_genome_file.string(), preprocessing_config.nucleotide_sequence_prefix, preprocessing_config.gene_prefix diff --git a/src/silo/preprocessing/preprocessing_config_reader.cpp b/src/silo/preprocessing/preprocessing_config_reader.cpp index 1f5b20813..6907d59c9 100644 --- a/src/silo/preprocessing/preprocessing_config_reader.cpp +++ b/src/silo/preprocessing/preprocessing_config_reader.cpp @@ -16,7 +16,6 @@ using silo::preprocessing::OutputDirectory; using silo::preprocessing::PangoLineageDefinitionFilename; using silo::preprocessing::PartitionsFolder; using silo::preprocessing::ReferenceGenomeFilename; -using silo::preprocessing::SerializedStateFolder; using silo::preprocessing::SortedPartitionsFolder; namespace YAML { @@ -34,6 +33,7 @@ struct convert { config = OptionalPreprocessingConfig( extractStringIfPresent(node, "inputDirectory"), extractStringIfPresent(node, "outputDirectory"), + extractStringIfPresent(node, "intermediateResultsDirectory"), extractStringIfPresent(node, "metadataFilename"), extractStringIfPresent(node, "pangoLineageDefinitionFilename"), extractStringIfPresent(node, "partitionsFolder"), @@ -65,6 +65,10 @@ PreprocessingConfig OptionalPreprocessingConfig::mergeValuesFromOrDefault( InputDirectory{input_directory.value_or( 
other.input_directory.value_or(silo::preprocessing::DEFAULT_INPUT_DIRECTORY.directory) )}, + IntermediateResultsDirectory{ + intermediate_results_directory.value_or(other.intermediate_results_directory.value_or( + silo::preprocessing::DEFAULT_INTERMEDIATE_RESULTS_DIRECTORY.directory + ))}, OutputDirectory{output_directory.value_or( other.output_directory.value_or(silo::preprocessing::DEFAULT_OUTPUT_DIRECTORY.directory) )}, @@ -81,9 +85,6 @@ PreprocessingConfig OptionalPreprocessingConfig::mergeValuesFromOrDefault( sorted_partition_folder.value_or(other.sorted_partition_folder.value_or( silo::preprocessing::DEFAULT_SORTED_PARTITIONS_FOLDER.folder ))}, - SerializedStateFolder{serialization_folder.value_or(other.serialization_folder.value_or( - silo::preprocessing::DEFAULT_SERIALIZED_STATE_FOLDER.folder - ))}, ReferenceGenomeFilename{reference_genome_file.value_or(other.reference_genome_file.value_or( silo::preprocessing::DEFAULT_REFERENCE_GENOME_FILENAME.filename ))}, diff --git a/src/silo/preprocessing/preprocessing_config_reader.test.cpp b/src/silo/preprocessing/preprocessing_config_reader.test.cpp index c450698d5..fc53454fb 100644 --- a/src/silo/preprocessing/preprocessing_config_reader.test.cpp +++ b/src/silo/preprocessing/preprocessing_config_reader.test.cpp @@ -19,26 +19,26 @@ TEST(PreprocessingConfigReader, shouldReadConfigWithCorrectParametersAndDefaults ); const std::string input_directory = "./testBaseData/exampleDataset/"; - const std::string output_directory = "./output/"; + const std::string intermediate_directory = "./temp/"; ASSERT_EQ(config.getMetadataInputFilename(), input_directory + "small_metadata_set.tsv"); ASSERT_EQ( config.getPangoLineageDefinitionFilename(), input_directory + "pangolineage_alias.json" ); ASSERT_EQ( config.getNucPartitionFilename("dummy", 0, 0), - output_directory + "partitions/nuc_dummy/P0_C0.zstdfasta" + intermediate_directory + "partitions/nuc_dummy/P0_C0.zstdfasta" ); ASSERT_EQ( config.getGenePartitionFilename("dummy2", 0, 
0), - output_directory + "partitions/gene_dummy2/P0_C0.zstdfasta" + intermediate_directory + "partitions/gene_dummy2/P0_C0.zstdfasta" ); ASSERT_EQ( config.getNucSortedPartitionFilename("dummy", 2, 1), - output_directory + "partitions_sorted/nuc_dummy/P2_C1.zstdfasta" + intermediate_directory + "partitions_sorted/nuc_dummy/P2_C1.zstdfasta" ); ASSERT_EQ( config.getGeneSortedPartitionFilename("dummy", 2, 1), - output_directory + "partitions_sorted/gene_dummy/P2_C1.zstdfasta" + intermediate_directory + "partitions_sorted/gene_dummy/P2_C1.zstdfasta" ); } @@ -59,7 +59,7 @@ TEST(PreprocessingConfigReader, shouldReadConfigWithOverriddenDefaults) { ); const std::string input_directory = "./testBaseData/exampleDataset/"; - const std::string output_directory = "./output/"; + const std::string intermediate_directory = "./output/overriddenTemp/"; ASSERT_EQ(config.getMetadataInputFilename(), input_directory + "small_metadata_set.tsv"); ASSERT_EQ( config.getPangoLineageDefinitionFilename(), input_directory + "pangolineage_alias.json" @@ -68,12 +68,13 @@ TEST(PreprocessingConfigReader, shouldReadConfigWithOverriddenDefaults) { ASSERT_EQ(config.getNucFilename("aligned"), input_directory + "aligned.fasta"); ASSERT_EQ( config.getNucPartitionFilename("aligned", 0, 1), - output_directory + "folder1/aligned/P0_C1.zstdfasta" + intermediate_directory + "folder1/aligned/P0_C1.zstdfasta" ); ASSERT_EQ( config.getNucSortedPartitionFilename("aligned", 2, 3), - output_directory + "folder2/aligned/P2_C3.zstdfasta" + intermediate_directory + "folder2/aligned/P2_C3.zstdfasta" ); + ASSERT_EQ(config.getOutputDirectory(), "./output/custom/"); } TEST(OptionalPreprocessingConfig, givenLeftHandSideHasValueThenMergeTakesLeftHandSideValue) { @@ -87,7 +88,7 @@ TEST(OptionalPreprocessingConfig, givenLeftHandSideHasValueThenMergeTakesLeftHan ASSERT_EQ( result.getGeneFilename("dummy"), - std::filesystem::path("./testBaseData/leftTestPrefix_dummy.fasta") + 
std::filesystem::path("./testBaseData/exampleDataset/leftTestPrefix_dummy.fasta") ); } @@ -101,7 +102,7 @@ TEST(OptionalPreprocessingConfig, givenLeftHandSideHasNotValueThenMergeTakesRigh ASSERT_EQ( result.getGeneFilename("dummy"), - std::filesystem::path("./testBaseData/rightTestPrefix_dummy.fasta") + std::filesystem::path("./testBaseData/exampleDataset/rightTestPrefix_dummy.fasta") ); } @@ -113,6 +114,7 @@ TEST(OptionalPreprocessingConfig, givenNeitherSideHasValueThenMergeTakesDefaultV const auto result = left.mergeValuesFromOrDefault(right); ASSERT_EQ( - result.getGeneFilename("dummy"), std::filesystem::path("./testBaseData/gene_dummy.fasta") + result.getGeneFilename("dummy"), + std::filesystem::path("./testBaseData/exampleDataset/gene_dummy.fasta") ); } diff --git a/src/silo_api/api.cpp b/src/silo_api/api.cpp index f73e96ebf..fb9d3fd37 100644 --- a/src/silo_api/api.cpp +++ b/src/silo_api/api.cpp @@ -180,7 +180,7 @@ class SiloServer : public Poco::Util::ServerApplication { auto database_preprocessing = silo::Database::preprocessing(preprocessing_config, database_config); - database_preprocessing.saveDatabaseState(preprocessing_config.getSerializedStateFolder()); + database_preprocessing.saveDatabaseState(preprocessing_config.getOutputDirectory()); return Application::EXIT_OK; }; diff --git a/testBaseData/exampleDataset/preprocessing_config.yaml b/testBaseData/exampleDataset/preprocessing_config.yaml index 9bbd33390..ce690ea9d 100644 --- a/testBaseData/exampleDataset/preprocessing_config.yaml +++ b/testBaseData/exampleDataset/preprocessing_config.yaml @@ -1,5 +1,3 @@ -inputDirectory: "./" -outputDirectory: "./output/" metadataFilename: "small_metadata_set.tsv" pangoLineageDefinitionFilename: "pangolineage_alias.json" referenceGenomeFilename: "reference_genomes.json" diff --git a/testBaseData/test_preprocessing_config_with_overridden_defaults.yaml b/testBaseData/test_preprocessing_config_with_overridden_defaults.yaml index 4b84137f6..268383ce6 100644 --- 
a/testBaseData/test_preprocessing_config_with_overridden_defaults.yaml +++ b/testBaseData/test_preprocessing_config_with_overridden_defaults.yaml @@ -1,10 +1,10 @@ inputDirectory: "./testBaseData/exampleDataset/" -outputDirectory: "./output/" +outputDirectory: "./output/custom/" +intermediateResultsDirectory: "./output/overriddenTemp/" metadataFilename: "small_metadata_set.tsv" pangoLineageDefinitionFilename: "pangolineage_alias.json" referenceGenomeFilename: "reference_genomes.json" partitionsFolder: "folder1/" sortedPartitionsFolder: "folder2/" -serializedStateFolder: "folder3/" genePrefix: "aaSeq_" nucleotideSequencePrefix: "" \ No newline at end of file