Skip to content

Commit

Permalink
Fix ref base lower case comparison issue
Browse files Browse the repository at this point in the history
  • Loading branch information
lczech committed Jul 17, 2024
1 parent 7c49b5e commit 1e60e8d
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 27 deletions.
10 changes: 5 additions & 5 deletions lib/genesis/population/format/frequency_table_input_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -955,9 +955,9 @@ void FrequencyTableInputStream::Iterator::increment_()
);

// Both ref genome and ref column are given and have a usable value. Try to match them.
if( header_info_.has_ref && current_variant_->reference_base != 'N' ) {
if( header_info_.has_ref && utils::to_upper( current_variant_->reference_base ) != 'N' ) {
// Get a shorthand, and check the bases that the processor allows.
auto const ref_base = current_variant_->reference_base;
auto const ref_base = utils::to_upper( current_variant_->reference_base );
assert( is_valid_base( ref_base ));

// Both are given and the base from the file is not 'N', so let's see if they agree.
Expand All @@ -982,7 +982,7 @@ void FrequencyTableInputStream::Iterator::increment_()
// or it's 'N', so that we might want to replace it by the ref genome. Both cases are
// treated the same here: Check that we can use the genome base, or use N if not.
if( is_valid_base( ref_gen_base )) {
current_variant_->reference_base = ref_gen_base;
current_variant_->reference_base = utils::to_upper( ref_gen_base );
} else {
current_variant_->reference_base = 'N';
}
Expand Down Expand Up @@ -1189,8 +1189,8 @@ void FrequencyTableInputStream::Iterator::process_sample_data_(

// Now store the counts in the sample, using the ref/alt base info if available,
// or fixed bases if ref and/or alt are not available.
char ref_base = variant.reference_base;
char alt_base = variant.alternative_base;
char ref_base = utils::to_upper( variant.reference_base );
char alt_base = utils::to_upper( variant.alternative_base );
assert( is_valid_base_or_n( ref_base ));
assert( is_valid_base_or_n( alt_base ));
if( utils::char_match_ci( ref_base, 'N' )) {
Expand Down
11 changes: 7 additions & 4 deletions lib/genesis/population/format/vcf_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "genesis/population/function/functions.hpp"
#include "genesis/population/sample_counts.hpp"
#include "genesis/population/variant.hpp"
#include "genesis/utils/text/char.hpp"

extern "C" {
#include <htslib/hts.h>
Expand Down Expand Up @@ -409,8 +410,9 @@ Variant convert_to_variant_as_pool( VcfRecord const& record )
Variant result;
result.chromosome = record.get_chromosome();
result.position = record.get_position();
result.reference_base = snp_chars.first[0];
result.alternative_base = snp_chars.first[1]; // TODO this is only reasonable for biallelic SNPs
result.reference_base = utils::to_upper( snp_chars.first[0] );
result.alternative_base = utils::to_upper( snp_chars.first[1] );
// TODO the alt base above is only reasonable for biallelic SNPs

// Process the sample AD counts that are present in the VCF record line.
result.samples.reserve( record.header().get_sample_count() );
Expand Down Expand Up @@ -481,8 +483,9 @@ Variant convert_to_variant_as_individuals(
// Prepare common fields of the result. Same as convert_to_variant_as_pool(), see there.
result.chromosome = record.get_chromosome();
result.position = record.get_position();
result.reference_base = snp_chars.first[0];
result.alternative_base = snp_chars.first[1]; // TODO this is only reasonable for biallelic SNPs
result.reference_base = utils::to_upper( snp_chars.first[0] );
result.alternative_base = utils::to_upper( snp_chars.first[1] );
// TODO the alt base above is only reasonable for biallelic SNPs

// We merge everything into one sample, representing the individuals as a pool.
result.samples.resize( 1 );
Expand Down
16 changes: 10 additions & 6 deletions lib/genesis/population/function/functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -573,16 +573,18 @@ void guess_and_set_ref_and_alt_bases(
if( variant.position == 0 ) {
throw std::runtime_error( "Invalid position 0 in Variant." );
}
ref_base = utils::to_upper( ref_base );

// Now use that reference base. If it is in ACGT, we use it as ref; if not, we check against
// ambiguity codes to see if it fits with our count-based ref and alt bases instead.
if( is_valid_base( ref_base )) {
if( variant.reference_base != 'N' && variant.reference_base != ref_base ) {
auto const var_ref_base = utils::to_upper( variant.reference_base );
if( var_ref_base != 'N' && var_ref_base != ref_base ) {
throw std::runtime_error(
"At chromosome \"" + variant.chromosome + "\" position " +
std::to_string( variant.position ) + ", the Reference Genome has base '" +
std::string( 1, ref_base ) + "', while the Variant already has mismatching base '" +
std::string( 1, variant.reference_base ) + "' set"
std::string( 1, var_ref_base ) + "' set"
);
}

Expand All @@ -593,6 +595,8 @@ void guess_and_set_ref_and_alt_bases(
} else {
// No usable ref base. Run the normal guessing.
guess_and_set_ref_and_alt_bases( variant, force, filter_policy );
auto const var_ref_base = utils::to_upper( variant.reference_base );
auto const var_alt_base = utils::to_upper( variant.alternative_base );

// Now we cross check that the ref genome base is a valid base,
// and also that it is an ambiguity char that contains either the ref or alt that we found.
Expand All @@ -601,8 +605,8 @@ void guess_and_set_ref_and_alt_bases(
bool contains = false;
try {
using genesis::sequence::nucleic_acid_code_containment;
contains |= nucleic_acid_code_containment( ref_base, variant.reference_base );
contains |= nucleic_acid_code_containment( ref_base, variant.alternative_base );
contains |= nucleic_acid_code_containment( ref_base, var_ref_base );
contains |= nucleic_acid_code_containment( ref_base, var_alt_base );
} catch(...) {
// The above throws an error if the given bases are not valid.
// Catch this, and re-throw a nicer, more understandable exception instead.
Expand All @@ -616,8 +620,8 @@ void guess_and_set_ref_and_alt_bases(
throw std::runtime_error(
"At chromosome \"" + variant.chromosome + "\" position " +
std::to_string( variant.position ) + ", the reference base is '" +
std::string( 1, variant.reference_base ) + "' and the alternative base is '" +
std::string( 1, variant.alternative_base ) +
std::string( 1, var_ref_base ) + "' and the alternative base is '" +
std::string( 1, var_alt_base ) +
"', determined from nucleotide counts in the data at this position. " +
"However, the Reference Genome has base '" + std::string( 1, ref_base ) +
"', which does not code for either of them, " +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "genesis/population/filter/variant_filter.hpp"
#include "genesis/population/function/functions.hpp"
#include "genesis/sequence/functions/codes.hpp"
#include "genesis/utils/text/char.hpp"

#include <algorithm>
#include <cassert>
Expand Down Expand Up @@ -499,14 +500,14 @@ void VariantGaplessInputStream::Iterator::prepare_current_variant_ref_base_()
current_locus_.position > 0 &&
current_locus_.position <= ref_genome_it_->length()
);
auto const ref_base = ref_genome_it_->site_at( current_locus_.position - 1 );
auto const ref_base = utils::to_upper( ref_genome_it_->site_at( current_locus_.position - 1 ));
// guess_and_set_ref_and_alt_bases( cur_var, ref_base, true );
if( is_valid_base( cur_var.reference_base )) {
if( is_valid_base( utils::to_upper( cur_var.reference_base ))) {
bool contains = false;
try {
using genesis::sequence::nucleic_acid_code_containment;
contains = nucleic_acid_code_containment(
ref_base, cur_var.reference_base
ref_base, utils::to_upper( cur_var.reference_base )
);
} catch(...) {
// The above throws an error if the given bases are not valid.
Expand All @@ -529,7 +530,7 @@ void VariantGaplessInputStream::Iterator::prepare_current_variant_ref_base_()
);
}
} else {
cur_var.reference_base = ref_base;
cur_var.reference_base = utils::to_upper( ref_base );
}
}

Expand Down
17 changes: 9 additions & 8 deletions lib/genesis/population/stream/variant_parallel_input_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
#include "genesis/population/filter/sample_counts_filter.hpp"
#include "genesis/population/filter/variant_filter.hpp"
#include "genesis/utils/core/logging.hpp"
#include "genesis/utils/text/char.hpp"

#include <algorithm>
#include <cassert>
Expand Down Expand Up @@ -183,38 +184,38 @@ Variant VariantParallelInputStream::Iterator::joined_variant(
// Set and check the ref and alt bases.
// This is the first input that has data here. Use it to initialize the bases.
if( ! bases_init ) {
res.reference_base = variants_[i]->reference_base;
res.alternative_base = variants_[i]->alternative_base;
res.reference_base = utils::to_upper( variants_[i]->reference_base );
res.alternative_base = utils::to_upper( variants_[i]->alternative_base );
bases_init = true;
}

// Now check that all inputs have the same bases. We however overwrite any input
// that has an N with an input that does not have N, to get the best data.
if( res.reference_base != variants_[i]->reference_base ) {
if( res.reference_base != utils::to_upper( variants_[i]->reference_base )) {
if( res.reference_base == 'N' ) {
res.reference_base = variants_[i]->reference_base;
res.reference_base = utils::to_upper( variants_[i]->reference_base );
} else if( params.allow_ref_base_mismatches ) {
res.reference_base = 'N';
} else {
throw std::runtime_error(
"Mismatching reference bases while iterating input sources in parallel at " +
to_string( current_locus_ ) + ". Some sources have base '" +
std::string( 1, res.reference_base ) + "' while others have '" +
std::string( 1, variants_[i]->reference_base ) + "'."
std::string( 1, utils::to_upper( variants_[i]->reference_base )) + "'."
);
}
}
if( res.alternative_base != variants_[i]->alternative_base ) {
if( res.alternative_base != utils::to_upper( variants_[i]->alternative_base )) {
if( res.alternative_base == 'N' ) {
res.alternative_base = variants_[i]->alternative_base;
res.alternative_base = utils::to_upper( variants_[i]->alternative_base );
} else if( params.allow_alt_base_mismatches ) {
res.alternative_base = 'N';
} else {
throw std::runtime_error(
"Mismatching alternative bases while iterating input sources in parallel at " +
to_string( current_locus_ ) + ". Some sources have base '" +
std::string( 1, res.alternative_base ) + "' while others have '" +
std::string( 1, variants_[i]->alternative_base ) + "'."
std::string( 1, utils::to_upper( variants_[i]->alternative_base )) + "'."
);
}
}
Expand Down

0 comments on commit 1e60e8d

Please sign in to comment.