From 448a6dedf04ffee3aeebd7a62a800bd31c182331 Mon Sep 17 00:00:00 2001
From: Fabian Engelniederhammer
Date: Tue, 16 Jul 2024 21:27:39 +0200
Subject: [PATCH] feat: have no default sequence by default, implement default amino acid sequence from config

closes #454
---
Note on the DatabaseConfig fmt formatter in src/silo/config/database_config.cpp:
each default sequence is printed from its own optional, wrapped in single quotes
when a value is set and rendered as null when it is unset.

 include/silo/config/database_config.h         |   3 +-
 src/silo/config/database_config.cpp           |  32 ++-
 src/silo/config/database_config.test.cpp      |  38 ++++
 src/silo/database.cpp                         |   2 +-
 src/silo/test/default_sequence.test.cpp       | 183 ++++++++++++++++++
 .../exampleDataset/database_config.yaml       |   3 +-
 6 files changed, 250 insertions(+), 11 deletions(-)
 create mode 100644 src/silo/test/default_sequence.test.cpp

diff --git a/include/silo/config/database_config.h b/include/silo/config/database_config.h
index 097b45483..95a898046 100644
--- a/include/silo/config/database_config.h
+++ b/include/silo/config/database_config.h
@@ -36,7 +36,8 @@ class DatabaseSchema {
 
 class DatabaseConfig {
   public:
-   std::string default_nucleotide_sequence;
+   std::optional default_nucleotide_sequence;
+   std::optional default_amino_acid_sequence;
    DatabaseSchema schema;
 
    [[nodiscard]] std::optional getMetadata(const std::string& name) const;
diff --git a/src/silo/config/database_config.cpp b/src/silo/config/database_config.cpp
index 6f9a45476..39feface5 100644
--- a/src/silo/config/database_config.cpp
+++ b/src/silo/config/database_config.cpp
@@ -38,6 +38,9 @@ ValueType silo::config::toDatabaseValueType(std::string_view type) {
 
 namespace {
 
+const std::string DEFAULT_NUCLEOTIDE_SEQUENCE_KEY = "defaultNucleotideSequence";
+const std::string DEFAULT_AMINO_ACID_SEQUENCE_KEY = "defaultAminoAcidSequence";
+
 std::string toString(ValueType type) {
    switch (type) {
       case ValueType::STRING:
@@ -63,10 +66,15 @@ struct convert {
    static bool decode(const Node& node, silo::config::DatabaseConfig& config) {
       config.schema = node["schema"].as();
 
-      if (node["defaultNucleotideSequence"].IsDefined()) {
-         config.default_nucleotide_sequence = node["defaultNucleotideSequence"].as();
-      } else {
-         config.default_nucleotide_sequence = "main";
+      if (node[DEFAULT_NUCLEOTIDE_SEQUENCE_KEY].IsDefined() &&
+          !node[DEFAULT_NUCLEOTIDE_SEQUENCE_KEY].IsNull()) {
+         config.default_nucleotide_sequence =
+            node[DEFAULT_NUCLEOTIDE_SEQUENCE_KEY].as();
+      }
+      if (node[DEFAULT_AMINO_ACID_SEQUENCE_KEY].IsDefined() &&
+          !node[DEFAULT_AMINO_ACID_SEQUENCE_KEY].IsNull()) {
+         config.default_amino_acid_sequence =
+            node[DEFAULT_AMINO_ACID_SEQUENCE_KEY].as();
       }
 
       SPDLOG_TRACE("Resulting database config: {}", config);
@@ -77,8 +85,11 @@ struct convert {
       Node node;
       node["schema"] = config.schema;
 
-      if (config.default_nucleotide_sequence != "main") {
-         node["defaultNucleotideSequence"] = config.default_nucleotide_sequence;
+      if (config.default_nucleotide_sequence.has_value()) {
+         node[DEFAULT_NUCLEOTIDE_SEQUENCE_KEY] = *config.default_nucleotide_sequence;
+      }
+      if (config.default_amino_acid_sequence.has_value()) {
+         node[DEFAULT_AMINO_ACID_SEQUENCE_KEY] = *config.default_amino_acid_sequence;
       }
       return node;
    }
@@ -233,8 +244,13 @@ DatabaseConfig DatabaseConfigReader::parseYaml(const std::string& yaml) const {
 ) -> decltype(ctx.out()) {
    return fmt::format_to(
       ctx.out(),
-      "{{ default_nucleotide_sequence: '{}', schema: {} }}",
-      database_config.default_nucleotide_sequence,
+      "{{ default_nucleotide_sequence: {}, default_amino_acid_sequence: {}, schema: {} }}",
+      database_config.default_nucleotide_sequence.has_value()
+         ? "'" + *database_config.default_nucleotide_sequence + "'"
+         : "null",
+      database_config.default_amino_acid_sequence.has_value()
+         ? "'" + *database_config.default_amino_acid_sequence + "'"
+         : "null",
       database_config.schema
    );
 }
diff --git a/src/silo/config/database_config.test.cpp b/src/silo/config/database_config.test.cpp
index 21cfaea98..2e7bdb091 100644
--- a/src/silo/config/database_config.test.cpp
+++ b/src/silo/config/database_config.test.cpp
@@ -138,6 +138,8 @@ TEST(DatabaseConfigReader, shouldReadConfigWithCorrectParameters) {
    ASSERT_EQ(config.schema.metadata[8].name, "qc_value");
    ASSERT_EQ(config.schema.metadata[8].type, ValueType::FLOAT);
    ASSERT_EQ(config.schema.metadata[8].generate_index, false);
+   ASSERT_EQ(config.default_nucleotide_sequence, std::nullopt);
+   ASSERT_EQ(config.default_amino_acid_sequence, std::nullopt);
 }
 
 TEST(DatabaseConfigReader, shouldThrowExceptionWhenConfigFileDoesNotExist) {
@@ -224,4 +226,40 @@ TEST(DatabaseConfigReader, shouldReadConfigWithoutPartitionBy) {
    ASSERT_EQ(config.schema.partition_by, std::nullopt);
 }
 
+TEST(DatabaseConfigReader, shouldReadConfigWithDefaultSequencesSet) {
+   const auto* yaml = R"-(
+schema:
+  instanceName: dummy without partitionBy
+  metadata:
+    - name: primaryKey
+      type: string
+  primaryKey: primaryKey
+defaultNucleotideSequence: defaultNuc
+defaultAminoAcidSequence: defaultAA
+)-";
+
+   const DatabaseConfig& config = DatabaseConfigReader().parseYaml(yaml);
+
+   ASSERT_EQ(config.default_nucleotide_sequence, "defaultNuc");
+   ASSERT_EQ(config.default_amino_acid_sequence, "defaultAA");
+}
+
+TEST(DatabaseConfigReader, shouldReadConfigWithDefaultSequencesSetButNull) {
+   const auto* yaml = R"-(
+schema:
+  instanceName: dummy without partitionBy
+  metadata:
+    - name: primaryKey
+      type: string
+  primaryKey: primaryKey
+defaultNucleotideSequence: null
+defaultAminoAcidSequence: null
+)-";
+
+   const DatabaseConfig& config = DatabaseConfigReader().parseYaml(yaml);
+
+   ASSERT_EQ(config.default_nucleotide_sequence, std::nullopt);
+   ASSERT_EQ(config.default_amino_acid_sequence, std::nullopt);
+}
+
 }  // namespace
diff --git a/src/silo/database.cpp b/src/silo/database.cpp
index aba4253a0..1c4bb1b8d 100644
--- a/src/silo/database.cpp
+++ b/src/silo/database.cpp
@@ -76,7 +76,7 @@ std::optional Database::getDefaultSequenceName() const
 
 template <>
 std::optional Database::getDefaultSequenceName() const {
-   return std::nullopt;
+   return database_config.default_amino_acid_sequence;
 }
 
 template <>
diff --git a/src/silo/test/default_sequence.test.cpp b/src/silo/test/default_sequence.test.cpp
new file mode 100644
index 000000000..eeb90a197
--- /dev/null
+++ b/src/silo/test/default_sequence.test.cpp
@@ -0,0 +1,183 @@
+#include 
+
+#include 
+
+#include "silo/test/query_fixture.test.h"
+
+using silo::ReferenceGenomes;
+using silo::config::DatabaseConfig;
+using silo::config::ValueType;
+using silo::test::QueryTestData;
+using silo::test::QueryTestScenario;
+
+namespace {
+const std::string VALUE_SEGMENT_1 = "A";
+const std::string VALUE_SEGMENT_2 = "C";
+
+const nlohmann::json DATA_DIFFERENT_FROM_REFERENCE = {
+   {"metadata", {{"primaryKey", "id"}}},
+   {"alignedNucleotideSequences", {{"segment1", VALUE_SEGMENT_1}, {"segment2", VALUE_SEGMENT_2}}},
+   {"unalignedNucleotideSequences", {{"segment1", nullptr}, {"segment2", nullptr}}},
+   {"alignedAminoAcidSequences",
+    {{"gene1", VALUE_SEGMENT_1 + "*"}, {"gene2", VALUE_SEGMENT_2 + "*"}}},
+   {"nucleotideInsertions", {{"segment1", {"1:AAA"}}, {"segment2", {"1:GGG"}}}},
+   {"aminoAcidInsertions", {{"gene1", {"1:AAA"}}, {"gene2", {"1:GGG"}}}}
+};
+
+const nlohmann::json DATA_EQUALS_TO_REFERENCE = {
+   {"metadata", {{"primaryKey", "equal to reference"}}},
+   {"alignedNucleotideSequences", {{"segment1", "T"}, {"segment2", "T"}}},
+   {"unalignedNucleotideSequences", {{"segment1", nullptr}, {"segment2", nullptr}}},
+   {"alignedAminoAcidSequences", {{"gene1", "T*"}, {"gene2", "T*"}}},
+   {"nucleotideInsertions", {{"segment1", {}}, {"segment2", {}}}},
+   {"aminoAcidInsertions", {{"gene1", {}}, {"gene2", {}}}}
+};
+
+const auto DATABASE_CONFIG = DatabaseConfig{
+   .default_nucleotide_sequence = "segment1",
+   .default_amino_acid_sequence = "gene1",
+   .schema =
+      {.instance_name = "dummy name",
+       .metadata = {{.name = "primaryKey", .type = ValueType::STRING}},
+       .primary_key = "primaryKey"}
+};
+
+const auto REFERENCE_GENOMES = ReferenceGenomes{
+   {{"segment1", "T"}, {"segment2", "T"}},
+   {{"gene1", "T*"}, {"gene2", "T*"}},
+};
+
+const QueryTestData TEST_DATA{
+   .ndjson_input_data = {DATA_DIFFERENT_FROM_REFERENCE, DATA_EQUALS_TO_REFERENCE},
+   .database_config = DATABASE_CONFIG,
+   .reference_genomes = REFERENCE_GENOMES
+};
+
+nlohmann::json createQueryWithFilter(const nlohmann::json& filter) {
+   return {{"action", {{"type", "Details"}}}, {"filterExpression", filter}};
+}
+
+const nlohmann::json EXPECTED_RESULT = {{{"primaryKey", "id"}}};
+
+const QueryTestScenario NUCLEOTIDE_EQUALS_NO_SEQUENCE_NAME = {
+   .name = "nucleotideEqualsWithoutSegmentTakesDefaultSequence",
+   .query = createQueryWithFilter(
+      {{"type", "NucleotideEquals"}, {"position", 1}, {"symbol", VALUE_SEGMENT_1}}
+   ),
+   .expected_query_result = EXPECTED_RESULT
+};
+
+const QueryTestScenario NUCLEOTIDE_EQUALS_NO_SEQUENCE_NAME_FILTER_BY_WRONG_VALUE = {
+   .name = "nucleotideEqualsWithoutSegmentFilterByWrongValue",
+   .query = createQueryWithFilter(
+      {{"type", "NucleotideEquals"}, {"position", 1}, {"symbol", VALUE_SEGMENT_2}}
+   ),
+   .expected_query_result = nlohmann::json::array()
+};
+
+const QueryTestScenario NUCLEOTIDE_EQUALS_SEGMENT_1 = {
+   .name = "nucleotideEqualsSegment1",
+   .query = createQueryWithFilter(
+      {{"type", "NucleotideEquals"},
+       {"sequenceName", "segment1"},
+       {"position", 1},
+       {"symbol", VALUE_SEGMENT_1}}
+   ),
+   .expected_query_result = EXPECTED_RESULT
+};
+
+const QueryTestScenario NUCLEOTIDE_EQUALS_SEGMENT_2 = {
+   .name = "nucleotideEqualsSegment2",
+   .query = createQueryWithFilter(
+      {{"type", "NucleotideEquals"},
+       {"sequenceName", "segment2"},
+       {"position", 1},
+       {"symbol", VALUE_SEGMENT_2}}
+   ),
+   .expected_query_result = EXPECTED_RESULT
+};
+
+const QueryTestScenario AMINO_ACID_EQUALS_NO_SEQUENCE_NAME = {
+   .name = "aminoAcidEqualsWithoutSequenceNameTakesDefaultSequence",
+   .query = createQueryWithFilter(
+      {{"type", "AminoAcidEquals"}, {"position", 1}, {"symbol", VALUE_SEGMENT_1}}
+   ),
+   .expected_query_result = EXPECTED_RESULT
+};
+
+const QueryTestScenario AMINO_ACID_EQUALS_NO_SEQUENCE_NAME_FILTER_BY_WRONG_VALUE = {
+   .name = "aminoAcidEqualsWithoutSequenceNameFilterByWrongValue",
+   .query = createQueryWithFilter(
+      {{"type", "AminoAcidEquals"}, {"position", 1}, {"symbol", VALUE_SEGMENT_2}}
+   ),
+   .expected_query_result = nlohmann::json::array()
+};
+
+const QueryTestScenario AMINO_ACID_EQUALS_GENE_1 = {
+   .name = "aminoAcidEqualsGene1",
+   .query = createQueryWithFilter(
+      {{"type", "AminoAcidEquals"},
+       {"sequenceName", "gene1"},
+       {"position", 1},
+       {"symbol", VALUE_SEGMENT_1}}
+   ),
+   .expected_query_result = EXPECTED_RESULT
+};
+
+const QueryTestScenario AMINO_ACID_EQUALS_GENE_2 = {
+   .name = "aminoAcidEqualsGene2",
+   .query = createQueryWithFilter(
+      {{"type", "AminoAcidEquals"},
+       {"sequenceName", "gene2"},
+       {"position", 1},
+       {"symbol", VALUE_SEGMENT_2}}
+   ),
+   .expected_query_result = EXPECTED_RESULT
+};
+
+const QueryTestScenario HAS_NUCLEOTIDE_MUTATION_WITHOUT_SEQUENCE_NAME = {
+   .name = "hasNucleotideMutationWithoutSequenceName",
+   .query = createQueryWithFilter({{"type", "HasNucleotideMutation"}, {"position", 1}}),
+   .expected_query_result = EXPECTED_RESULT
+};
+
+const QueryTestScenario HAS_AMINO_ACID_MUTATION_WITHOUT_SEQUENCE_NAME = {
+   .name = "hasAminoAcidMutationWithoutSequenceName",
+   .query = createQueryWithFilter({{"type", "HasAminoAcidMutation"}, {"position", 1}}),
+   .expected_query_result = EXPECTED_RESULT
+};
+
+const QueryTestScenario NUCLEOTIDE_INSERTION_CONTAINS_WITHOUT_SEQUENCE_NAME = {
+   .name = "nucleotideInsertionContainsWithoutSequenceName",
+   .query = createQueryWithFilter({{"type", "InsertionContains"}, {"value", "A"}, {"position", 1}}),
+   .expected_query_result = EXPECTED_RESULT
+};
+
+const QueryTestScenario AMINO_ACID_INSERTION_CONTAINS_WITHOUT_SEQUENCE_NAME = {
+   .name = "aminoAcidInsertionContainsWithoutSequenceName",
+   .query = createQueryWithFilter(
+      {{"type", "AminoAcidInsertionContains"}, {"value", "A"}, {"position", 1}}
+   ),
+   .expected_query_result = EXPECTED_RESULT
+};
+
+}  // namespace
+
+QUERY_TEST(
+   DefaultSequenceTest,
+   TEST_DATA,
+   ::testing::Values(
+      NUCLEOTIDE_EQUALS_NO_SEQUENCE_NAME,
+      NUCLEOTIDE_EQUALS_NO_SEQUENCE_NAME_FILTER_BY_WRONG_VALUE,
+      NUCLEOTIDE_EQUALS_SEGMENT_1,
+      NUCLEOTIDE_EQUALS_SEGMENT_2,
+      AMINO_ACID_EQUALS_NO_SEQUENCE_NAME,
+      AMINO_ACID_EQUALS_NO_SEQUENCE_NAME_FILTER_BY_WRONG_VALUE,
+      AMINO_ACID_EQUALS_GENE_1,
+      AMINO_ACID_EQUALS_GENE_2,
+      HAS_NUCLEOTIDE_MUTATION_WITHOUT_SEQUENCE_NAME,
+      HAS_AMINO_ACID_MUTATION_WITHOUT_SEQUENCE_NAME,
+      NUCLEOTIDE_INSERTION_CONTAINS_WITHOUT_SEQUENCE_NAME,
+      AMINO_ACID_INSERTION_CONTAINS_WITHOUT_SEQUENCE_NAME
+   )
+);
diff --git a/testBaseData/exampleDataset/database_config.yaml b/testBaseData/exampleDataset/database_config.yaml
index b80226460..816437e48 100644
--- a/testBaseData/exampleDataset/database_config.yaml
+++ b/testBaseData/exampleDataset/database_config.yaml
@@ -26,4 +26,5 @@ schema:
       type: boolean
   primaryKey: gisaid_epi_isl
   dateToSortBy: date
-  partitionBy: pango_lineage
\ No newline at end of file
+  partitionBy: pango_lineage
+defaultNucleotideSequence: "main"