
Commit

more tests for randomize and critical bugfix: the 'offset+limit'th value would be inserted twice in the heap-sort algorithm
Taepper committed Feb 27, 2024
1 parent 0773999 commit 374a0be
Showing 5 changed files with 208 additions and 86 deletions.
26 changes: 13 additions & 13 deletions endToEndTests/test/queries/DetailsOrderByLimit.json
@@ -25,19 +25,6 @@
}
},
"expectedQueryResult": [
{
"age": 50,
"aminoAcidInsertions": null,
"country": "Switzerland",
"date": "2021-03-03",
"division": "Valais",
"gisaid_epi_isl": "EPI_ISL_1408062",
"nucleotideInsertions": "22204:CAGAA",
"pango_lineage": "B.1.1.7",
"qc_value": 0.97,
"region": "Europe",
"unsorted_date": "2020-11-24"
},
{
"age": 4,
"aminoAcidInsertions": "S:214:EPE",
@@ -50,6 +37,19 @@
"qc_value": 0.98,
"region": "Europe",
"unsorted_date": null
},
{
"age": 51,
"aminoAcidInsertions": null,
"country": "Switzerland",
"date": "2021-03-21",
"division": "Vaud",
"gisaid_epi_isl": "EPI_ISL_1597890",
"nucleotideInsertions": "22339:GCTGGT",
"pango_lineage": "B.1.1.7",
"qc_value": 0.96,
"region": null,
"unsorted_date": "2021-01-25"
}
]
}
36 changes: 19 additions & 17 deletions endToEndTests/test/queries/nOf_2of3_details.json
@@ -3,7 +3,9 @@
"query": {
"action": {
"type": "Details",
"orderByFields": ["gisaid_epi_isl"]
"randomize": {
"seed": 1232
}
},
"filterExpression": {
"type": "N-Of",
@@ -30,17 +32,17 @@
},
"expectedQueryResult": [
{
"age": 50,
"age": 58,
"aminoAcidInsertions": null,
"country": "Switzerland",
"date": "2020-11-13",
"division": "Solothurn",
"gisaid_epi_isl": "EPI_ISL_1005148",
"nucleotideInsertions": "25701:CCC",
"pango_lineage": "B.1.221",
"qc_value": 0.92,
"date": "2021-04-28",
"division": "Basel-Stadt",
"gisaid_epi_isl": "EPI_ISL_2019235",
"nucleotideInsertions": null,
"pango_lineage": "B.1.1.7",
"qc_value": 0.9,
"region": "Europe",
"unsorted_date": "2020-12-17"
"unsorted_date": "2021-01-22"
},
{
"age": 50,
@@ -69,17 +71,17 @@
"unsorted_date": "2021-02-10"
},
{
"age": 58,
"age": 50,
"aminoAcidInsertions": null,
"country": "Switzerland",
"date": "2021-04-28",
"division": "Basel-Stadt",
"gisaid_epi_isl": "EPI_ISL_2019235",
"nucleotideInsertions": null,
"pango_lineage": "B.1.1.7",
"qc_value": 0.9,
"date": "2020-11-13",
"division": "Solothurn",
"gisaid_epi_isl": "EPI_ISL_1005148",
"nucleotideInsertions": "25701:CCC",
"pango_lineage": "B.1.221",
"qc_value": 0.92,
"region": "Europe",
"unsorted_date": "2021-01-22"
"unsorted_date": "2020-12-17"
}
]
}
32 changes: 16 additions & 16 deletions src/silo/query_engine/actions/action.cpp
@@ -169,24 +169,24 @@ std::optional<uint32_t> parseOffset(const nlohmann::json& json) {
}

std::optional<uint32_t> parseRandomizeSeed(const nlohmann::json& json) {
if (json.contains("randomize")) {
if (json["randomize"].is_boolean()) {
if (json["randomize"].get<bool>()) {
const uint32_t time_based_seed =
std::chrono::system_clock::now().time_since_epoch().count();
return time_based_seed;
}
return std::nullopt;
if (!json.contains("randomize")) {
return std::nullopt;
}
if (json["randomize"].is_boolean()) {
if (json["randomize"].get<bool>()) {
const uint32_t time_based_seed =
std::chrono::system_clock::now().time_since_epoch().count();
return time_based_seed;
}
CHECK_SILO_QUERY(
json["randomize"].is_object() && json["randomize"].contains("seed") &&
json["randomize"]["seed"].is_number_unsigned(),
"If the action contains 'randomize', it must be either a boolean or an object "
"containing an unsigned 'seed'"
)
return json["randomize"]["seed"].get<uint32_t>();
return std::nullopt;
}
return std::nullopt;
CHECK_SILO_QUERY(
json["randomize"].is_object() && json["randomize"].contains("seed") &&
json["randomize"]["seed"].is_number_unsigned(),
"If the action contains 'randomize', it must be either a boolean or an object "
"containing an unsigned 'seed'"
)
return json["randomize"]["seed"].get<uint32_t>();
}

// NOLINTNEXTLINE(readability-identifier-naming)
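The refactoring above flattens the nested ifs into an early return without changing behaviour: `randomize` may be omitted, given as a boolean, or given as an object carrying an unsigned `seed`. A minimal sketch of the three accepted forms, assuming `parseRandomizeSeed` from the diff above is declared and visible at the call site (illustrative only, not part of the commit):

#include <cassert>
#include <cstdint>
#include <optional>

#include <nlohmann/json.hpp>

// Declaration as shown in the diff above; in SILO this helper lives in
// action.cpp, so linking against it here is purely for illustration.
std::optional<uint32_t> parseRandomizeSeed(const nlohmann::json& json);

int main() {
   using nlohmann::json;

   // Object form: a fixed seed yields a reproducible shuffle of the result rows.
   assert(parseRandomizeSeed(json::parse(R"({"randomize": {"seed": 1231}})")) == 1231U);

   // Boolean true: a time-based seed is generated, so some seed is returned.
   assert(parseRandomizeSeed(json::parse(R"({"randomize": true})")).has_value());

   // Boolean false or a missing field: no seed, hence no shuffling.
   assert(!parseRandomizeSeed(json::parse(R"({"randomize": false})")).has_value());
   assert(!parseRandomizeSeed(json::parse(R"({})")).has_value());
   return 0;
}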
1 change: 1 addition & 0 deletions src/silo/query_engine/actions/details.cpp
@@ -131,6 +131,7 @@ std::vector<actions::Tuple> produceSortedTuplesWithLimit(
my_tuples.back() = current_tuple;
std::push_heap(my_tuples.begin(), my_tuples.end(), tuple_comparator);
}
iterator++;
for (; iterator != end; iterator++) {
tuple_factory.overwrite(current_tuple, *iterator);
if (tuple_comparator(current_tuple, my_tuples.front())) {
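The single added line above is the critical bugfix named in the commit message. produceSortedTuplesWithLimit keeps the offset+limit smallest tuples in a max-heap; once the heap is full, the iterator still points at the tuple that was just pushed, so without advancing it first that tuple was compared and inserted a second time in the main loop, duplicating the 'offset+limit'th value. A self-contained sketch of the same heap pattern over plain ints, where k plays the role of offset+limit (simplified names, not the actual SILO implementation):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <vector>

// Keep the k smallest values of `values` using a max-heap, mirroring the
// two-phase structure of produceSortedTuplesWithLimit.
std::vector<int> smallestK(const std::vector<int>& values, std::size_t k) {
   std::vector<int> heap;  // max-heap of the k smallest values seen so far
   if (k == 0 || values.empty()) {
      return heap;
   }
   auto it = values.begin();
   const auto end = values.end();

   // Phase 1: push elements until the heap holds k values. The loop exits
   // right after pushing, so `it` still points at the element just pushed.
   while (true) {
      heap.push_back(*it);
      std::push_heap(heap.begin(), heap.end());
      if (heap.size() == k || std::next(it) == end) {
         break;
      }
      ++it;
   }

   // The commit's fix corresponds to this step: advance past the element that
   // was already pushed. Without it, the k-th element is considered again in
   // phase 2 and can end up in the result twice.
   ++it;

   // Phase 2: whenever a smaller element appears, replace the current maximum.
   for (; it != end; ++it) {
      if (*it < heap.front()) {
         std::pop_heap(heap.begin(), heap.end());
         heap.back() = *it;
         std::push_heap(heap.begin(), heap.end());
      }
   }
   std::sort_heap(heap.begin(), heap.end());
   return heap;
}

int main() {
   const std::vector<int> values{7, 3, 9, 1, 5, 8, 2};
   for (const int value : smallestK(values, 3)) {
      std::cout << value << ' ';  // prints: 1 2 3
   }
   std::cout << '\n';
   return 0;
}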
199 changes: 159 additions & 40 deletions src/silo/test/randomize.test.cpp
@@ -4,34 +4,49 @@

#include "silo/test/query_fixture.test.h"

using nlohmann::json;

using silo::ReferenceGenomes;
using silo::config::DatabaseConfig;
using silo::config::ValueType;
using silo::test::QueryTestData;
using silo::test::QueryTestScenario;

const std::vector<nlohmann::json> DATA = {
{{"metadata", {{"key", "id1"}, {"col", "A"}}},
{"alignedNucleotideSequences", {{"segment1", nullptr}}},
{"unalignedNucleotideSequences", {{"segment1", nullptr}}},
{"alignedAminoAcidSequences", {{"gene1", nullptr}}}},
{{"metadata", {{"key", "id2"}, {"col", "A"}}},
{"alignedNucleotideSequences", {{"segment1", nullptr}}},
{"unalignedNucleotideSequences", {{"segment1", nullptr}}},
{"alignedAminoAcidSequences", {{"gene1", nullptr}}}},
{{"metadata", {{"key", "id3"}, {"col", "A"}}},
{"alignedNucleotideSequences", {{"segment1", nullptr}}},
{"unalignedNucleotideSequences", {{"segment1", nullptr}}},
{"alignedAminoAcidSequences", {{"gene1", nullptr}}}},
{{"metadata", {{"key", "id4"}, {"col", "A"}}},
{"alignedNucleotideSequences", {{"segment1", nullptr}}},
{"unalignedNucleotideSequences", {{"segment1", nullptr}}},
{"alignedAminoAcidSequences", {{"gene1", nullptr}}}},
{{"metadata", {{"key", "id5"}, {"col", "A"}}},
{"alignedNucleotideSequences", {{"segment1", nullptr}}},
{"unalignedNucleotideSequences", {{"segment1", nullptr}}},
{"alignedAminoAcidSequences", {{"gene1", nullptr}}}}
};
const auto DATA_JSON = R"([
{
"metadata": {"key": "id1", "col": "A"},
"alignedNucleotideSequences": {"segment1": null},
"unalignedNucleotideSequences": {"segment1": null},
"alignedAminoAcidSequences": {"gene1": null}
},
{
"metadata": {"key": "id2", "col": "B"},
"alignedNucleotideSequences": {"segment1": null},
"unalignedNucleotideSequences": {"segment1": null},
"alignedAminoAcidSequences": {"gene1": null}
},
{
"metadata": {"key": "id3", "col": "A"},
"alignedNucleotideSequences": {"segment1": null},
"unalignedNucleotideSequences": {"segment1": null},
"alignedAminoAcidSequences": {"gene1": null}
},
{
"metadata": {"key": "id4", "col": "B"},
"alignedNucleotideSequences": {"segment1": null},
"unalignedNucleotideSequences": {"segment1": null},
"alignedAminoAcidSequences": {"gene1": null}
},
{
"metadata": {"key": "id5", "col": "A"},
"alignedNucleotideSequences": {"segment1": null},
"unalignedNucleotideSequences": {"segment1": null},
"alignedAminoAcidSequences": {"gene1": null}
}
])";

// Parse the JSON array literal into a vector of per-sample json objects
const std::vector<json> DATA = json::parse(DATA_JSON);

const auto DATABASE_CONFIG = DatabaseConfig{
"segment1",
@@ -47,39 +62,143 @@ const QueryTestData TEST_DATA{DATA, DATABASE_CONFIG, REFERENCE_GENOMES};

const QueryTestScenario RANDOMIZE_SEED = {
"seed1231ProvidedShouldShuffleResults",
{{"action", {{"type", "Details"}, {"fields", {"key"}}, {"randomize", {{"seed", 1231}}}}},
{"filterExpression", {{"type", "True"}}}},
{{{"key", "id4"}}, {{"key", "id1"}}, {{"key", "id5"}}, {{"key", "id2"}}, {{"key", "id3"}}}
json::parse(
R"({"action": {"type": "Details", "fields": ["key"], "randomize": {"seed": 1231}},
"filterExpression": {"type": "True"}})"
),
json::parse(
R"([{"key": "id4"},
{"key": "id1"},
{"key": "id5"},
{"key": "id2"},
{"key": "id3"}])"
)
};

const QueryTestScenario RANDOMIZE_SEED_DIFFERENT = {
"seed12312ProvidedShouldShuffleResultsDifferently",
{{"action", {{"type", "Details"}, {"fields", {"key"}}, {"randomize", {{"seed", 12312}}}}},
{"filterExpression", {{"type", "True"}}}},
{{{"key", "id1"}}, {{"key", "id4"}}, {{"key", "id3"}}, {{"key", "id2"}}, {{"key", "id5"}}}
json::parse(
R"({"action": {"type": "Details", "fields": ["key"], "randomize": {"seed": 12312}},
"filterExpression": {"type": "True"}})"
),
json::parse(
R"([{"key": "id1"},
{"key": "id4"},
{"key": "id3"},
{"key": "id2"},
{"key": "id5"}])"
)
};

const QueryTestScenario EXPLICIT_DO_NOT_RANDOMIZE = {
"explicitlyDoNotRandomize",
{{"action", {{"type", "Details"}, {"fields", {"key"}}, {"randomize", false}}},
{"filterExpression", {{"type", "True"}}}},
{{{"key", "id1"}}, {{"key", "id2"}}, {{"key", "id3"}}, {{"key", "id4"}}, {{"key", "id5"}}}
json::parse(
R"({"action": {"type": "Details", "fields": ["key"], "randomize": false},
"filterExpression": {"type": "True"}})"
),
json::parse(
R"([{"key": "id1"},
{"key": "id2"},
{"key": "id3"},
{"key": "id4"},
{"key": "id5"}])"
)
};

const QueryTestScenario AGGREGATE = {
"aggregateRandomize",
{{"action",
{{"type", "Aggregated"}, {"groupByFields", {"key"}}, {"randomize", {{"seed", 12321}}}}},
{"filterExpression", {{"type", "True"}}}},
{{{"count", 1}, {"key", "id3"}},
{{"count", 1}, {"key", "id1"}},
{{"count", 1}, {"key", "id4"}},
{{"count", 1}, {"key", "id5"}},
{{"count", 1}, {"key", "id2"}}}
json::parse(
R"({"action": {"type": "Aggregated", "groupByFields": ["key"], "randomize": {"seed": 12321}},
"filterExpression": {"type": "True"}})"
),
json::parse(
R"([{"count": 1, "key": "id3"},
{"count": 1, "key": "id1"},
{"count": 1, "key": "id4"},
{"count": 1, "key": "id5"},
{"count": 1, "key": "id2"}])"
)
};

const QueryTestScenario ORDER_BY_PRECEDENCE = {
"orderByTakePrecedenceOverRandomize",
json::parse(
R"({"action": {"type": "Details", "fields": ["key", "col"], "randomize": {"seed": 123212}, "orderByFields": ["col"]},
"filterExpression": {"type": "True"}})"
),
json::parse(
R"([{"key": "id5", "col": "A"},
{"key": "id1", "col": "A"},
{"key": "id3", "col": "A"},
{"key": "id2", "col": "B"},
{"key": "id4", "col": "B"}])"
)
};

const QueryTestScenario ORDER_BY_AGGREGATE_RANDOMIZE = {
"orderingByAggregatedCount",
json::parse(
R"({"action": {"type": "Aggregated", "groupByFields": ["col"], "randomize": true, "orderByFields": ["count"]},
"filterExpression": {"type": "True"}})"
),
json::parse(
R"([{"count": 2, "col": "B"},
{"count": 3, "col": "A"}])"
)
};

const QueryTestScenario LIMIT_2_RANDOMIZE = {
"detailsWithLimit2AndOffsetRandomized",
json::parse(
R"({"action": {"type": "Details", "fields": ["key", "col"],
"orderByFields": ["col", "key"], "limit": 2, "offset": 2},
"filterExpression": {"type": "True"}})"
),
json::parse(
R"([{"key": "id5", "col": "A"},
{"key": "id2", "col": "B"}])"
)
};

const QueryTestScenario LIMIT_3_RANDOMIZE = {
"detailsWithLimit3AndOffsetRandomized",
json::parse(
R"({"action": {"type": "Details", "fields": ["key", "col"],
"orderByFields": ["col", "key"], "limit": 3, "offset": 2},
"filterExpression": {"type": "True"}})"
),
json::parse(
R"([{"key": "id5", "col": "A"},
{"key": "id2", "col": "B"},
{"key": "id4", "col": "B"}])"
)
};

const QueryTestScenario AGGREGATE_LIMIT_RANDOMIZE = {
"aggregateWithLimitAndOffsetRandomized",
json::parse(
R"({"action": {"type": "Aggregated", "groupByFields": ["key", "col"], "randomize": {"seed": 123},
"orderByFields": ["col"], "limit": 2, "offset": 1},
"filterExpression": {"type": "True"}})"
),
json::parse(
R"([{"count": 1, "key": "id1", "col": "A"},
{"count": 1, "key": "id3", "col": "A"}])"
)
};

QUERY_TEST(
RandomizeTest,
TEST_DATA,
::testing::Values(RANDOMIZE_SEED, RANDOMIZE_SEED_DIFFERENT, EXPLICIT_DO_NOT_RANDOMIZE, AGGREGATE)
::testing::Values(
RANDOMIZE_SEED,
RANDOMIZE_SEED_DIFFERENT,
EXPLICIT_DO_NOT_RANDOMIZE,
AGGREGATE,
ORDER_BY_PRECEDENCE,
ORDER_BY_AGGREGATE_RANDOMIZE,
LIMIT_2_RANDOMIZE,
LIMIT_3_RANDOMIZE,
AGGREGATE_LIMIT_RANDOMIZE
)
);
