Skip to content

Commit

Permalink
fix: don't abort when reading table in chunks
Browse files Browse the repository at this point in the history
The bug was introduced in #483 and only occurred when the table reader tried to read the second chunk.
Therefore, this commit also adds a preprocessor test with a dataset that is large enough to trigger the bug.
  • Loading branch information
fengelniederhammer committed Jun 25, 2024
1 parent e012211 commit 26b1558
Show file tree
Hide file tree
Showing 7 changed files with 334 additions and 7 deletions.
3 changes: 1 addition & 2 deletions src/silo/common/table_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,7 @@ size_t silo::TableReader::read() {
for (size_t row_in_chunk = 0; row_in_chunk < current_chunk_size; row_in_chunk++) {
column_functions.at(column_idx)
.function(
current_start_of_chunk + row_in_chunk,
column.GetValue(current_start_of_chunk + row_in_chunk)
current_start_of_chunk + row_in_chunk, column.GetValue(row_in_chunk)
);
}
}
Expand Down
25 changes: 20 additions & 5 deletions src/silo/preprocessing/preprocessor.test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,7 @@ const Scenario NO_NUCLEOTIDE_SEQUENCES = {
}
}
)",
.expected_query_result = nlohmann::json::parse(R"(
[{"count":30}])")
.expected_query_result = nlohmann::json::parse(R"([{"count":30}])")
};

const Scenario NO_SEQUENCES = {
Expand All @@ -210,8 +209,23 @@ const Scenario NO_SEQUENCES = {
}
}
)",
.expected_query_result = nlohmann::json::parse(R"(
[{"count":30}])")
.expected_query_result = nlohmann::json::parse(R"([{"count":30}])")
};

// Test scenario that runs the preprocessor on the input found in
// "testBaseData/mediumSizedRsvDataset/".
//
// It expects 19662 sequences and checks that an "Aggregated" query with a
// "True" filter expression (i.e. no filtering) reports that same count.
//
// NOTE(review): per the accompanying commit message, this dataset was added
// because it is large enough that the table reader has to read more than one
// chunk, which is the situation in which the fixed bug appeared - confirm
// before shrinking or replacing the dataset.
const Scenario MEDIUM_SIZED_RSV_DATASET = {
.input_directory = "testBaseData/mediumSizedRsvDataset/",
.expected_sequence_count = 19662,
.query = R"(
{
"action": {
"type": "Aggregated"
},
"filterExpression": {
"type": "True"
}
}
)",
.expected_query_result = nlohmann::json::parse(R"([{"count":19662}])")
};

class PreprocessorTestFixture : public ::testing::TestWithParam<Scenario> {};
Expand All @@ -230,7 +244,8 @@ INSTANTIATE_TEST_SUITE_P(
EMPTY_INPUT_NDJSON_UNPARTITIONED,
NO_GENES,
NO_NUCLEOTIDE_SEQUENCES,
NO_SEQUENCES
NO_SEQUENCES,
MEDIUM_SIZED_RSV_DATASET
),
printTestName
);
Expand Down
11 changes: 11 additions & 0 deletions src/silo/storage/column_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include <vector>

#include <fmt/format.h>
#include <spdlog/spdlog.h>
#include <boost/algorithm/string.hpp>
#include <duckdb.hpp>

Expand Down Expand Up @@ -65,6 +66,10 @@ void ColumnPartitionGroup::addValueToColumn(
float_columns.at(column_name).insert(value.ToString());
return;
}
SPDLOG_ERROR(
"Unknown column type '{}' for column '{}'", static_cast<int>(column_type), column_name
);
abort();
}

void ColumnPartitionGroup::addNullToColumn(const std::string& column_name, ColumnType column_type) {
Expand All @@ -91,6 +96,9 @@ void ColumnPartitionGroup::addNullToColumn(const std::string& column_name, Colum
float_columns.at(column_name).insertNull();
return;
}
SPDLOG_ERROR(
"Unknown column type '{}' for column '{}'", static_cast<int>(column_type), column_name
);
abort();
}

Expand Down Expand Up @@ -122,6 +130,9 @@ void ColumnPartitionGroup::reserveSpaceInColumn(
float_columns.at(column_name).reserve(row_count);
return;
}
SPDLOG_ERROR(
"Unknown column type '{}' for column '{}'", static_cast<int>(column_type), column_name
);
abort();
}

Expand Down
Binary file not shown.
254 changes: 254 additions & 0 deletions testBaseData/mediumSizedRsvDataset/database_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
schema:
instanceName: "RSV A"
opennessLevel: OPEN
metadata:
- name: accession
type: string
- name: version
type: int
- name: submissionId
type: string
- name: accessionVersion
type: string
- name: isRevocation
type: boolean
- name: submitter
type: string
generateIndex: true
- name: groupId
type: int
- name: groupName
type: string
generateIndex: true
- name: submittedAt
type: int
- name: releasedAt
type: int
- name: dataUseTerms
type: string
generateIndex: true
- name: dataUseTermsRestrictedUntil
type: date
- name: versionStatus
type: string
- name: sample_collection_date
type: date
- name: ncbi_release_date
type: date
- name: ncbi_update_date
type: date
- name: geo_loc_country
type: string
generateIndex: true
- name: geo_loc_admin_1
type: string
generateIndex: true
- name: geo_loc_admin_2
type: string
generateIndex: true
- name: geo_loc_city
type: string
generateIndex: true
- name: geo_loc_site
type: string
- name: specimen_collector_sample_id
type: string
- name: authors
type: string
- name: author_affiliations
type: string
generateIndex: true
- name: ncbi_submitter_country
type: string
generateIndex: true
- name: insdc_accession_base
type: string
- name: insdc_version
type: int
- name: insdc_accession_full
type: string
- name: bioproject_accessions
type: string
- name: biosample_accession
type: string
- name: culture_id
type: string
- name: sample_received_date
type: date
- name: sample_type
type: string
- name: purpose_of_sampling
type: string
- name: presampling_activity
type: string
- name: anatomical_material
type: string
- name: anatomical_part
type: string
- name: body_product
type: string
- name: environmental_material
type: string
- name: environmental_site
type: string
- name: collection_device
type: string
- name: collection_method
type: string
- name: food_product
type: string
- name: food_product_properties
type: string
- name: specimen_processing
type: string
- name: specimen_processing_details
type: string
- name: experimental_specimen_role_type
type: string
- name: host_age
type: int
- name: host_age_bin
type: string
- name: host_gender
type: string
- name: host_origin_country
type: string
- name: host_disease
type: string
- name: signs_and_symptoms
type: string
- name: host_health_state
type: string
- name: host_health_outcome
type: string
- name: travel_history
type: string
- name: exposure_event
type: string
- name: host_role
type: string
- name: exposure_setting
type: string
- name: exposure_details
type: string
- name: previous_infection_disease
type: string
- name: previous_infection_organism
type: string
- name: host_vaccination_status
type: string
- name: purpose_of_sequencing
type: string
- name: sequencing_date
type: date
- name: amplicon_pcr_primer_scheme
type: string
- name: amplicon_size
type: string
- name: sequencing_instrument
type: string
- name: sequencing_protocol
type: string
- name: sequencing_assay_type
type: string
- name: sequenced_by_organization
type: string
- name: sequenced_by_contact_name
type: string
- name: sequenced_by_contact_email
type: string
- name: raw_sequence_data_processing_method
type: string
- name: dehosting_method
type: string
- name: reference_genome_accession
type: string
- name: consensus_sequence_software_name
type: string
- name: consensus_sequence_software_version
type: string
- name: depth_of_coverage
type: int
- name: breadth_of_coverage
type: int
- name: quality_control_method_name
type: string
- name: quality_control_method_version
type: string
- name: quality_control_determination
type: string
- name: quality_control_issues
type: string
- name: quality_control_details
type: string
- name: diagnostic_measurement_method
type: string
- name: diagnostic_target_presence
type: string
- name: diagnostic_target_gene_name
type: string
- name: diagnostic_measurement_value
type: string
- name: diagnostic_measurement_unit
type: string
- name: ncbi_completeness
type: string
generateIndex: true
- name: length
type: int
- name: host_name_scientific
type: string
generateIndex: true
- name: host_name_common
type: string
generateIndex: true
- name: host_taxon_id
type: int
- name: is_lab_host
type: string
generateIndex: true
- name: cell_line
type: string
generateIndex: true
- name: passage_number
type: int
- name: passage_method
type: string
generateIndex: true
- name: ncbi_protein_count
type: int
- name: ncbi_sourcedb
type: string
generateIndex: true
- name: ncbi_virus_name
type: string
generateIndex: true
- name: ncbi_virus_tax_id
type: int
- name: sra_run_accession
type: string
- name: total_snps
type: int
- name: lineage
type: string
generateIndex: true
- name: total_inserted_nucs
type: int
- name: total_deleted_nucs
type: int
- name: total_ambiguous_nucs
type: int
- name: total_unknown_nucs
type: int
- name: total_frame_shifts
type: int
- name: frame_shifts
type: string
- name: completeness
type: float
- name: total_stop_codons
type: int
- name: stop_codons
type: string
primaryKey: insdc_accession_full
dateToSortBy: sample_collection_date
2 changes: 2 additions & 0 deletions testBaseData/mediumSizedRsvDataset/preprocessing_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ndjsonInputFilename: "data.ndjson.zst"
referenceGenomeFilename: "reference_genomes.json"
46 changes: 46 additions & 0 deletions testBaseData/mediumSizedRsvDataset/reference_genomes.json

Large diffs are not rendered by default.

0 comments on commit 26b1558

Please sign in to comment.