Skip to content

Commit

Permalink
feat: have no default sequence by default, implement default amino ac…
Browse files Browse the repository at this point in the history
…id sequence from config

closes #454
  • Loading branch information
fengelniederhammer committed Jul 18, 2024
1 parent 2c7f3b9 commit 448a6de
Show file tree
Hide file tree
Showing 6 changed files with 250 additions and 11 deletions.
3 changes: 2 additions & 1 deletion include/silo/config/database_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ class DatabaseSchema {

class DatabaseConfig {
public:
std::string default_nucleotide_sequence;
std::optional<std::string> default_nucleotide_sequence;
std::optional<std::string> default_amino_acid_sequence;
DatabaseSchema schema;

[[nodiscard]] std::optional<DatabaseMetadata> getMetadata(const std::string& name) const;
Expand Down
32 changes: 24 additions & 8 deletions src/silo/config/database_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ ValueType silo::config::toDatabaseValueType(std::string_view type) {

namespace {

// YAML keys under which the optional default sequence names are read from and
// written to the database config node (see decode/encode below).
const std::string DEFAULT_NUCLEOTIDE_SEQUENCE_KEY = "defaultNucleotideSequence";
const std::string DEFAULT_AMINO_ACID_SEQUENCE_KEY = "defaultAminoAcidSequence";

std::string toString(ValueType type) {
switch (type) {
case ValueType::STRING:
Expand All @@ -63,10 +66,15 @@ struct convert<silo::config::DatabaseConfig> {
static bool decode(const Node& node, silo::config::DatabaseConfig& config) {
config.schema = node["schema"].as<silo::config::DatabaseSchema>();

if (node["defaultNucleotideSequence"].IsDefined()) {
config.default_nucleotide_sequence = node["defaultNucleotideSequence"].as<std::string>();
} else {
config.default_nucleotide_sequence = "main";
if (node[DEFAULT_NUCLEOTIDE_SEQUENCE_KEY].IsDefined() &&
!node[DEFAULT_NUCLEOTIDE_SEQUENCE_KEY].IsNull()) {
config.default_nucleotide_sequence =
node[DEFAULT_NUCLEOTIDE_SEQUENCE_KEY].as<std::string>();
}
if (node[DEFAULT_AMINO_ACID_SEQUENCE_KEY].IsDefined() &&
!node[DEFAULT_AMINO_ACID_SEQUENCE_KEY].IsNull()) {
config.default_amino_acid_sequence =
node[DEFAULT_AMINO_ACID_SEQUENCE_KEY].as<std::string>();
}

SPDLOG_TRACE("Resulting database config: {}", config);
Expand All @@ -77,8 +85,11 @@ struct convert<silo::config::DatabaseConfig> {
Node node;
node["schema"] = config.schema;

if (config.default_nucleotide_sequence != "main") {
node["defaultNucleotideSequence"] = config.default_nucleotide_sequence;
if (config.default_nucleotide_sequence.has_value()) {
node[DEFAULT_NUCLEOTIDE_SEQUENCE_KEY] = *config.default_nucleotide_sequence;
}
if (config.default_amino_acid_sequence.has_value()) {
node[DEFAULT_AMINO_ACID_SEQUENCE_KEY] = *config.default_amino_acid_sequence;
}
return node;
}
Expand Down Expand Up @@ -233,8 +244,13 @@ DatabaseConfig DatabaseConfigReader::parseYaml(const std::string& yaml) const {
) -> decltype(ctx.out()) {
return fmt::format_to(
ctx.out(),
"{{ default_nucleotide_sequence: '{}', schema: {} }}",
database_config.default_nucleotide_sequence,
"{{ default_nucleotide_sequence: {}, default_amino_acid_sequence: {}, schema: {} }}",
// Each optional sequence name is rendered quoted, or as `null` when unset.
database_config.default_nucleotide_sequence.has_value()
? "'" + *database_config.default_nucleotide_sequence + "'"
: "null",
// Must dereference the amino acid optional here: dereferencing the nucleotide
// optional would print the wrong name and is undefined behaviour when only the
// amino acid default is configured.
database_config.default_amino_acid_sequence.has_value()
? "'" + *database_config.default_amino_acid_sequence + "'"
: "null",
database_config.schema
);
}
Expand Down
38 changes: 38 additions & 0 deletions src/silo/config/database_config.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,8 @@ TEST(DatabaseConfigReader, shouldReadConfigWithCorrectParameters) {
ASSERT_EQ(config.schema.metadata[8].name, "qc_value");
ASSERT_EQ(config.schema.metadata[8].type, ValueType::FLOAT);
ASSERT_EQ(config.schema.metadata[8].generate_index, false);
ASSERT_EQ(config.default_nucleotide_sequence, std::nullopt);
ASSERT_EQ(config.default_amino_acid_sequence, std::nullopt);
}

TEST(DatabaseConfigReader, shouldThrowExceptionWhenConfigFileDoesNotExist) {
Expand Down Expand Up @@ -224,4 +226,40 @@ TEST(DatabaseConfigReader, shouldReadConfigWithoutPartitionBy) {
ASSERT_EQ(config.schema.partition_by, std::nullopt);
}

// Parses a config that sets both defaultNucleotideSequence and
// defaultAminoAcidSequence and checks that both values end up in the
// corresponding DatabaseConfig fields.
TEST(DatabaseConfigReader, shouldReadConfigWithDefaultSequencesSet) {
const auto* yaml = R"-(
schema:
instanceName: dummy without partitionBy
metadata:
- name: primaryKey
type: string
primaryKey: primaryKey
defaultNucleotideSequence: defaultNuc
defaultAminoAcidSequence: defaultAA
)-";

const DatabaseConfig& config = DatabaseConfigReader().parseYaml(yaml);

ASSERT_EQ(config.default_nucleotide_sequence, "defaultNuc");
ASSERT_EQ(config.default_amino_acid_sequence, "defaultAA");
}

// Parses a config in which both default sequence keys are present but set to
// `null` and checks that both DatabaseConfig fields are std::nullopt.
TEST(DatabaseConfigReader, shouldReadConfigWithDefaultSequencesSetButNull) {
const auto* yaml = R"-(
schema:
instanceName: dummy without partitionBy
metadata:
- name: primaryKey
type: string
primaryKey: primaryKey
defaultNucleotideSequence: null
defaultAminoAcidSequence: null
)-";

const DatabaseConfig& config = DatabaseConfigReader().parseYaml(yaml);

ASSERT_EQ(config.default_nucleotide_sequence, std::nullopt);
ASSERT_EQ(config.default_amino_acid_sequence, std::nullopt);
}

} // namespace
2 changes: 1 addition & 1 deletion src/silo/database.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ std::optional<std::string> Database::getDefaultSequenceName<Nucleotide>() const

template <>
std::optional<std::string> Database::getDefaultSequenceName<AminoAcid>() const {
return std::nullopt;
return database_config.default_amino_acid_sequence;
}

template <>
Expand Down
183 changes: 183 additions & 0 deletions src/silo/test/default_sequence.test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
#include <nlohmann/json.hpp>

#include <optional>

#include "silo/test/query_fixture.test.h"

using silo::ReferenceGenomes;
using silo::config::DatabaseConfig;
using silo::config::ValueType;
using silo::test::QueryTestData;
using silo::test::QueryTestScenario;

namespace {
// Symbols used by the test record: VALUE_SEGMENT_1 is stored in segment1/gene1,
// VALUE_SEGMENT_2 in segment2/gene2 (see DATA_DIFFERENT_FROM_REFERENCE).
const std::string VALUE_SEGMENT_1 = "A";
const std::string VALUE_SEGMENT_2 = "C";

// Input record with primary key "id". Its aligned sequences hold
// VALUE_SEGMENT_1 for segment1/gene1 and VALUE_SEGMENT_2 for segment2/gene2
// (amino acid sequences get a trailing "*"), which differs from the "T" / "T*"
// values in REFERENCE_GENOMES. It also carries one insertion entry per
// sequence ("1:AAA" for segment1/gene1, "1:GGG" for segment2/gene2).
const nlohmann::json DATA_DIFFERENT_FROM_REFERENCE = {
{"metadata", {{"primaryKey", "id"}}},
{"alignedNucleotideSequences", {{"segment1", VALUE_SEGMENT_1}, {"segment2", VALUE_SEGMENT_2}}},
{"unalignedNucleotideSequences", {{"segment1", nullptr}, {"segment2", nullptr}}},
{"alignedAminoAcidSequences",
{{"gene1", VALUE_SEGMENT_1 + "*"}, {"gene2", VALUE_SEGMENT_2 + "*"}}},
{"nucleotideInsertions", {{"segment1", {"1:AAA"}}, {"segment2", {"1:GGG"}}}},
{"aminoAcidInsertions", {{"gene1", {"1:AAA"}}, {"gene2", {"1:GGG"}}}}
};

// Input record with primary key "equal to reference". Its aligned sequences
// use the same "T" / "T*" values as REFERENCE_GENOMES, and no insertion
// strings are given for any sequence.
const nlohmann::json DATA_EQUALS_TO_REFERENCE = {
{"metadata", {{"primaryKey", "equal to reference"}}},
{"alignedNucleotideSequences", {{"segment1", "T"}, {"segment2", "T"}}},
{"unalignedNucleotideSequences", {{"segment1", nullptr}, {"segment2", nullptr}}},
{"alignedAminoAcidSequences", {{"gene1", "T*"}, {"gene2", "T*"}}},
{"nucleotideInsertions", {{"segment1", {}}, {"segment2", {}}}},
{"aminoAcidInsertions", {{"gene1", {}}, {"gene2", {}}}}
};

// Database config under test: "segment1" is the default nucleotide sequence
// and "gene1" the default amino acid sequence. The schema has a single string
// metadata column "primaryKey", which is also the primary key.
const auto DATABASE_CONFIG = DatabaseConfig{
.default_nucleotide_sequence = "segment1",
.default_amino_acid_sequence = "gene1",
.schema =
{.instance_name = "dummy name",
.metadata = {{.name = "primaryKey", .type = ValueType::STRING}},
.primary_key = "primaryKey"}
};

// Reference sequences: "T" for segment1/segment2 and "T*" for gene1/gene2.
// NOTE(review): presumably the first list is the nucleotide references and the
// second the amino acid references - confirm against ReferenceGenomes.
const auto REFERENCE_GENOMES = ReferenceGenomes{
{{"segment1", "T"}, {"segment2", "T"}},
{{"gene1", "T*"}, {"gene2", "T*"}},
};

// Bundles the two input records, the database config and the reference genomes
// that all scenarios in this file run against.
const QueryTestData TEST_DATA{
.ndjson_input_data = {DATA_DIFFERENT_FROM_REFERENCE, DATA_EQUALS_TO_REFERENCE},
.database_config = DATABASE_CONFIG,
.reference_genomes = REFERENCE_GENOMES
};

/// Builds a query JSON with a "Details" action and the given filter expression.
///
/// @param filter the JSON placed under "filterExpression"; taken by const
///               reference so that the argument is not copied just to be read.
/// @return the query object `{"action": {"type": "Details"}, "filterExpression": filter}`
nlohmann::json createQueryWithFilter(const nlohmann::json& filter) {
   return {{"action", {{"type", "Details"}}}, {"filterExpression", filter}};
}

// Result expected by every scenario that should match only the record with
// primary key "id" (DATA_DIFFERENT_FROM_REFERENCE).
const nlohmann::json EXPECTED_RESULT = {{{"primaryKey", "id"}}};

// NucleotideEquals without "sequenceName": filtering position 1 by
// VALUE_SEGMENT_1 must match the "id" record, which holds that symbol in
// segment1 - the default nucleotide sequence set in DATABASE_CONFIG.
const QueryTestScenario NUCLEOTIDE_EQUALS_NO_SEQUENCE_NAME = {
.name = "nucleotideEqualsWithoutSegmentTakesDefaultSequence",
.query = createQueryWithFilter(
{{"type", "NucleotideEquals"}, {"position", 1}, {"symbol", VALUE_SEGMENT_1}}
),
.expected_query_result = EXPECTED_RESULT
};

// NucleotideEquals without "sequenceName": filtering position 1 by
// VALUE_SEGMENT_2 must return an empty array, since in the test data that
// symbol only occurs in segment2 and not in the default sequence segment1.
const QueryTestScenario NUCLEOTIDE_EQUALS_NO_SEQUENCE_NAME_FILTER_BY_WRONG_VALUE = {
.name = "nucleotideEqualsWithoutSegmentFilterByWrongValue",
.query = createQueryWithFilter(
{{"type", "NucleotideEquals"}, {"position", 1}, {"symbol", VALUE_SEGMENT_2}}
),
.expected_query_result = nlohmann::json::array()
};

// NucleotideEquals with explicit sequenceName "segment1": filtering position 1
// by VALUE_SEGMENT_1 must match the "id" record.
const QueryTestScenario NUCLEOTIDE_EQUALS_SEGMENT_1 = {
.name = "nucleotideEqualsSegment1",
.query = createQueryWithFilter(
{{"type", "NucleotideEquals"},
{"sequenceName", "segment1"},
{"position", 1},
{"symbol", VALUE_SEGMENT_1}}
),
.expected_query_result = EXPECTED_RESULT
};

// NucleotideEquals with explicit sequenceName "segment2": filtering position 1
// by VALUE_SEGMENT_2 must match the "id" record, i.e. the non-default sequence
// remains addressable by name.
const QueryTestScenario NUCLEOTIDE_EQUALS_SEGMENT_2 = {
.name = "nucleotideEqualsSegment2",
.query = createQueryWithFilter(
{{"type", "NucleotideEquals"},
{"sequenceName", "segment2"},
{"position", 1},
{"symbol", VALUE_SEGMENT_2}}
),
.expected_query_result = EXPECTED_RESULT
};

// AminoAcidEquals without "sequenceName": filtering position 1 by
// VALUE_SEGMENT_1 must match the "id" record, which holds that symbol in
// gene1 - the default amino acid sequence set in DATABASE_CONFIG.
const QueryTestScenario AMINO_ACID_EQUALS_NO_SEQUENCE_NAME = {
.name = "aminoAcidEqualsWithoutSequenceNameTakesDefaultSequence",
.query = createQueryWithFilter(
{{"type", "AminoAcidEquals"}, {"position", 1}, {"symbol", VALUE_SEGMENT_1}}
),
.expected_query_result = EXPECTED_RESULT
};

// AminoAcidEquals without "sequenceName": filtering position 1 by
// VALUE_SEGMENT_2 must return an empty array, since in the test data that
// symbol only occurs in gene2 and not in the default sequence gene1.
const QueryTestScenario AMINO_ACID_EQUALS_NO_SEQUENCE_NAME_FILTER_BY_WRONG_VALUE = {
.name = "aminoAcidEqualsWithoutSequenceNameFilterByWrongValue",
.query = createQueryWithFilter(
{{"type", "AminoAcidEquals"}, {"position", 1}, {"symbol", VALUE_SEGMENT_2}}
),
.expected_query_result = nlohmann::json::array()
};

// AminoAcidEquals with explicit sequenceName "gene1": filtering position 1 by
// VALUE_SEGMENT_1 must match the "id" record.
const QueryTestScenario AMINO_ACID_EQUALS_GENE_1 = {
.name = "aminoAcidEqualsGene1",
.query = createQueryWithFilter(
{{"type", "AminoAcidEquals"},
{"sequenceName", "gene1"},
{"position", 1},
{"symbol", VALUE_SEGMENT_1}}
),
.expected_query_result = EXPECTED_RESULT
};

// AminoAcidEquals with explicit sequenceName "gene2": filtering position 1 by
// VALUE_SEGMENT_2 must match the "id" record, i.e. the non-default sequence
// remains addressable by name.
const QueryTestScenario AMINO_ACID_EQUALS_GENE_2 = {
.name = "aminoAcidEqualsGene2",
.query = createQueryWithFilter(
{{"type", "AminoAcidEquals"},
{"sequenceName", "gene2"},
{"position", 1},
{"symbol", VALUE_SEGMENT_2}}
),
.expected_query_result = EXPECTED_RESULT
};

// HasNucleotideMutation at position 1 without "sequenceName": expects only the
// "id" record; the other record carries the reference symbol "T" there.
const QueryTestScenario HAS_NUCLEOTIDE_MUTATION_WITHOUT_SEQUENCE_NAME = {
.name = "hasNucleotideMutationWithoutSequenceName",
.query = createQueryWithFilter({{"type", "HasNucleotideMutation"}, {"position", 1}}),
.expected_query_result = EXPECTED_RESULT
};

// HasAminoAcidMutation at position 1 without "sequenceName": expects only the
// "id" record; the other record carries the reference symbol "T" there.
const QueryTestScenario HAS_AMINO_ACID_MUTATION_WITHOUT_SEQUENCE_NAME = {
.name = "hasAminoAcidMutationWithoutSequenceName",
.query = createQueryWithFilter({{"type", "HasAminoAcidMutation"}, {"position", 1}}),
.expected_query_result = EXPECTED_RESULT
};

// InsertionContains for value "A" at position 1 without "sequenceName":
// expects the "id" record. In the test data only its segment1 insertion
// ("1:AAA") contains "A"; the segment2 insertion is "1:GGG".
const QueryTestScenario NUCLEOTIDE_INSERTION_CONTAINS_WITHOUT_SEQUENCE_NAME = {
.name = "nucleotideInsertionContainsWithoutSequenceName",
.query = createQueryWithFilter({{"type", "InsertionContains"}, {"value", "A"}, {"position", 1}}),
.expected_query_result = EXPECTED_RESULT
};

// AminoAcidInsertionContains for value "A" at position 1 without
// "sequenceName": expects the "id" record. In the test data only its gene1
// insertion ("1:AAA") contains "A"; the gene2 insertion is "1:GGG".
const QueryTestScenario AMINO_ACID_INSERTION_CONTAINS_WITHOUT_SEQUENCE_NAME = {
.name = "aminoAcidInsertionContainsWithoutSequenceName",
.query = createQueryWithFilter(
{{"type", "AminoAcidInsertionContains"}, {"value", "A"}, {"position", 1}}
),
.expected_query_result = EXPECTED_RESULT
};

} // namespace

// Registers the DefaultSequenceTest suite with TEST_DATA and the scenarios
// listed below.
// NOTE(review): QUERY_TEST comes from query_fixture.test.h; presumably it runs
// each scenario's query against a database built from TEST_DATA - confirm there.
QUERY_TEST(
DefaultSequenceTest,
TEST_DATA,
::testing::Values(
NUCLEOTIDE_EQUALS_NO_SEQUENCE_NAME,
NUCLEOTIDE_EQUALS_NO_SEQUENCE_NAME_FILTER_BY_WRONG_VALUE,
NUCLEOTIDE_EQUALS_SEGMENT_1,
NUCLEOTIDE_EQUALS_SEGMENT_2,
AMINO_ACID_EQUALS_NO_SEQUENCE_NAME,
AMINO_ACID_EQUALS_NO_SEQUENCE_NAME_FILTER_BY_WRONG_VALUE,
AMINO_ACID_EQUALS_GENE_1,
AMINO_ACID_EQUALS_GENE_2,
HAS_NUCLEOTIDE_MUTATION_WITHOUT_SEQUENCE_NAME,
HAS_AMINO_ACID_MUTATION_WITHOUT_SEQUENCE_NAME,
NUCLEOTIDE_INSERTION_CONTAINS_WITHOUT_SEQUENCE_NAME,
AMINO_ACID_INSERTION_CONTAINS_WITHOUT_SEQUENCE_NAME
)
);
3 changes: 2 additions & 1 deletion testBaseData/exampleDataset/database_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,5 @@ schema:
type: boolean
primaryKey: gisaid_epi_isl
dateToSortBy: date
partitionBy: pango_lineage
partitionBy: pango_lineage
defaultNucleotideSequence: "main"

0 comments on commit 448a6de

Please sign in to comment.