fix: don't abort when reading table in chunks

The bug was introduced in #483 and only occurred when the table reader tried to read the second chunk. Therefore, this commit also adds a preprocessor test with a dataset that is large enough for the bug to occur.
GenSpectrum · Jun 25, 2024 · 26b1558 · 26b1558
1 parent e012211
commit 26b1558
Show file tree

Hide file tree

Showing 7 changed files with 334 additions and 7 deletions.
diff --git a/src/silo/common/table_reader.cpp b/src/silo/common/table_reader.cpp
@@ -55,8 +55,7 @@ size_t silo::TableReader::read() {
                for (size_t row_in_chunk = 0; row_in_chunk < current_chunk_size; row_in_chunk++) {
                   column_functions.at(column_idx)
                      .function(
-                        current_start_of_chunk + row_in_chunk,
-                        column.GetValue(current_start_of_chunk + row_in_chunk)
+                        current_start_of_chunk + row_in_chunk, column.GetValue(row_in_chunk)
                      );
                }
             }

diff --git a/src/silo/preprocessing/preprocessor.test.cpp b/src/silo/preprocessing/preprocessor.test.cpp
@@ -193,8 +193,7 @@ const Scenario NO_NUCLEOTIDE_SEQUENCES = {
          }
       }
    )",
-   .expected_query_result = nlohmann::json::parse(R"(
-[{"count":30}])")
+   .expected_query_result = nlohmann::json::parse(R"([{"count":30}])")
 };
 
 const Scenario NO_SEQUENCES = {
@@ -210,8 +209,23 @@ const Scenario NO_SEQUENCES = {
          }
       }
    )",
-   .expected_query_result = nlohmann::json::parse(R"(
-[{"count":30}])")
+   .expected_query_result = nlohmann::json::parse(R"([{"count":30}])")
+};
+
+const Scenario MEDIUM_SIZED_RSV_DATASET = {
+   .input_directory = "testBaseData/mediumSizedRsvDataset/",
+   .expected_sequence_count = 19662,
+   .query = R"(
+      {
+         "action": {
+           "type": "Aggregated"
+         },
+         "filterExpression": {
+            "type": "True"
+         }
+      }
+   )",
+   .expected_query_result = nlohmann::json::parse(R"([{"count":19662}])")
 };
 
 class PreprocessorTestFixture : public ::testing::TestWithParam<Scenario> {};
@@ -230,7 +244,8 @@ INSTANTIATE_TEST_SUITE_P(
       EMPTY_INPUT_NDJSON_UNPARTITIONED,
       NO_GENES,
       NO_NUCLEOTIDE_SEQUENCES,
-      NO_SEQUENCES
+      NO_SEQUENCES,
+      MEDIUM_SIZED_RSV_DATASET
    ),
    printTestName
 );

diff --git a/src/silo/storage/column_group.cpp b/src/silo/storage/column_group.cpp
@@ -7,6 +7,7 @@
 #include <vector>
 
 #include <fmt/format.h>
+#include <spdlog/spdlog.h>
 #include <boost/algorithm/string.hpp>
 #include <duckdb.hpp>
 
@@ -65,6 +66,10 @@ void ColumnPartitionGroup::addValueToColumn(
          float_columns.at(column_name).insert(value.ToString());
          return;
    }
+   SPDLOG_ERROR(
+      "Unknown column type '{}' for column '{}'", static_cast<int>(column_type), column_name
+   );
+   abort();
 }
 
 void ColumnPartitionGroup::addNullToColumn(const std::string& column_name, ColumnType column_type) {
@@ -91,6 +96,9 @@ void ColumnPartitionGroup::addNullToColumn(const std::string& column_name, Colum
          float_columns.at(column_name).insertNull();
          return;
    }
+   SPDLOG_ERROR(
+      "Unknown column type '{}' for column '{}'", static_cast<int>(column_type), column_name
+   );
    abort();
 }
 
@@ -122,6 +130,9 @@ void ColumnPartitionGroup::reserveSpaceInColumn(
          float_columns.at(column_name).reserve(row_count);
          return;
    }
+   SPDLOG_ERROR(
+      "Unknown column type '{}' for column '{}'", static_cast<int>(column_type), column_name
+   );
    abort();
 }
 

diff --git a/testBaseData/mediumSizedRsvDataset/data.ndjson.zst b/testBaseData/mediumSizedRsvDataset/data.ndjson.zst
diff --git a/testBaseData/mediumSizedRsvDataset/database_config.yaml b/testBaseData/mediumSizedRsvDataset/database_config.yaml
@@ -0,0 +1,254 @@
+schema:
+  instanceName: "RSV A"
+  opennessLevel: OPEN
+  metadata:
+    - name: accession
+      type: string
+    - name: version
+      type: int
+    - name: submissionId
+      type: string
+    - name: accessionVersion
+      type: string
+    - name: isRevocation
+      type: boolean
+    - name: submitter
+      type: string
+      generateIndex: true
+    - name: groupId
+      type: int
+    - name: groupName
+      type: string
+      generateIndex: true
+    - name: submittedAt
+      type: int
+    - name: releasedAt
+      type: int
+    - name: dataUseTerms
+      type: string
+      generateIndex: true
+    - name: dataUseTermsRestrictedUntil
+      type: date
+    - name: versionStatus
+      type: string
+    - name: sample_collection_date
+      type: date
+    - name: ncbi_release_date
+      type: date
+    - name: ncbi_update_date
+      type: date
+    - name: geo_loc_country
+      type: string
+      generateIndex: true
+    - name: geo_loc_admin_1
+      type: string
+      generateIndex: true
+    - name: geo_loc_admin_2
+      type: string
+      generateIndex: true
+    - name: geo_loc_city
+      type: string
+      generateIndex: true
+    - name: geo_loc_site
+      type: string
+    - name: specimen_collector_sample_id
+      type: string
+    - name: authors
+      type: string
+    - name: author_affiliations
+      type: string
+      generateIndex: true
+    - name: ncbi_submitter_country
+      type: string
+      generateIndex: true
+    - name: insdc_accession_base
+      type: string
+    - name: insdc_version
+      type: int
+    - name: insdc_accession_full
+      type: string
+    - name: bioproject_accessions
+      type: string
+    - name: biosample_accession
+      type: string
+    - name: culture_id
+      type: string
+    - name: sample_received_date
+      type: date
+    - name: sample_type
+      type: string
+    - name: purpose_of_sampling
+      type: string
+    - name: presampling_activity
+      type: string
+    - name: anatomical_material
+      type: string
+    - name: anatomical_part
+      type: string
+    - name: body_product
+      type: string
+    - name: environmental_material
+      type: string
+    - name: environmental_site
+      type: string
+    - name: collection_device
+      type: string
+    - name: collection_method
+      type: string
+    - name: food_product
+      type: string
+    - name: food_product_properties
+      type: string
+    - name: specimen_processing
+      type: string
+    - name: specimen_processing_details
+      type: string
+    - name: experimental_specimen_role_type
+      type: string
+    - name: host_age
+      type: int
+    - name: host_age_bin
+      type: string
+    - name: host_gender
+      type: string
+    - name: host_origin_country
+      type: string
+    - name: host_disease
+      type: string
+    - name: signs_and_symptoms
+      type: string
+    - name: host_health_state
+      type: string
+    - name: host_health_outcome
+      type: string
+    - name: travel_history
+      type: string
+    - name: exposure_event
+      type: string
+    - name: host_role
+      type: string
+    - name: exposure_setting
+      type: string
+    - name: exposure_details
+      type: string
+    - name: previous_infection_disease
+      type: string
+    - name: previous_infection_organism
+      type: string
+    - name: host_vaccination_status
+      type: string
+    - name: purpose_of_sequencing
+      type: string
+    - name: sequencing_date
+      type: date
+    - name: amplicon_pcr_primer_scheme
+      type: string
+    - name: amplicon_size
+      type: string
+    - name: sequencing_instrument
+      type: string
+    - name: sequencing_protocol
+      type: string
+    - name: sequencing_assay_type
+      type: string
+    - name: sequenced_by_organization
+      type: string
+    - name: sequenced_by_contact_name
+      type: string
+    - name: sequenced_by_contact_email
+      type: string
+    - name: raw_sequence_data_processing_method
+      type: string
+    - name: dehosting_method
+      type: string
+    - name: reference_genome_accession
+      type: string
+    - name: consensus_sequence_software_name
+      type: string
+    - name: consensus_sequence_software_version
+      type: string
+    - name: depth_of_coverage
+      type: int
+    - name: breadth_of_coverage
+      type: int
+    - name: quality_control_method_name
+      type: string
+    - name: quality_control_method_version
+      type: string
+    - name: quality_control_determination
+      type: string
+    - name: quality_control_issues
+      type: string
+    - name: quality_control_details
+      type: string
+    - name: diagnostic_measurement_method
+      type: string
+    - name: diagnostic_target_presence
+      type: string
+    - name: diagnostic_target_gene_name
+      type: string
+    - name: diagnostic_measurement_value
+      type: string
+    - name: diagnostic_measurement_unit
+      type: string
+    - name: ncbi_completeness
+      type: string
+      generateIndex: true
+    - name: length
+      type: int
+    - name: host_name_scientific
+      type: string
+      generateIndex: true
+    - name: host_name_common
+      type: string
+      generateIndex: true
+    - name: host_taxon_id
+      type: int
+    - name: is_lab_host
+      type: string
+      generateIndex: true
+    - name: cell_line
+      type: string
+      generateIndex: true
+    - name: passage_number
+      type: int
+    - name: passage_method
+      type: string
+      generateIndex: true
+    - name: ncbi_protein_count
+      type: int
+    - name: ncbi_sourcedb
+      type: string
+      generateIndex: true
+    - name: ncbi_virus_name
+      type: string
+      generateIndex: true
+    - name: ncbi_virus_tax_id
+      type: int
+    - name: sra_run_accession
+      type: string
+    - name: total_snps
+      type: int
+    - name: lineage
+      type: string
+      generateIndex: true
+    - name: total_inserted_nucs
+      type: int
+    - name: total_deleted_nucs
+      type: int
+    - name: total_ambiguous_nucs
+      type: int
+    - name: total_unknown_nucs
+      type: int
+    - name: total_frame_shifts
+      type: int
+    - name: frame_shifts
+      type: string
+    - name: completeness
+      type: float
+    - name: total_stop_codons
+      type: int
+    - name: stop_codons
+      type: string
+  primaryKey: insdc_accession_full
+  dateToSortBy: sample_collection_date
diff --git a/testBaseData/mediumSizedRsvDataset/preprocessing_config.yaml b/testBaseData/mediumSizedRsvDataset/preprocessing_config.yaml
@@ -0,0 +1,2 @@
+ndjsonInputFilename: "data.ndjson.zst"
+referenceGenomeFilename: "reference_genomes.json"
diff --git a/testBaseData/mediumSizedRsvDataset/reference_genomes.json b/testBaseData/mediumSizedRsvDataset/reference_genomes.json
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		ndjsonInputFilename: "data.ndjson.zst"
		referenceGenomeFilename: "reference_genomes.json"