From bf9100362bce4dc27979111f1421e23dbf60885b Mon Sep 17 00:00:00 2001 From: Lucas Czech Date: Mon, 3 Jun 2024 20:43:41 +0200 Subject: [PATCH 1/4] Renam window average option Absolute Sum to just Sum --- lib/genesis/population/function/window_average.hpp | 4 ++-- test/src/population/fst_pool.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/genesis/population/function/window_average.hpp b/lib/genesis/population/function/window_average.hpp index c9b95eb6..6144ef8e 100644 --- a/lib/genesis/population/function/window_average.hpp +++ b/lib/genesis/population/function/window_average.hpp @@ -116,7 +116,7 @@ enum class WindowAveragePolicy /** * @brief Simply report the total sum, with no averaging, i.e., the absolute value of the metric. */ - kAbsoluteSum + kSum }; /** @@ -172,7 +172,7 @@ inline double window_average_denominator( case WindowAveragePolicy::kValidSnps: { return sample_counts_filter_stats[SampleCountsFilterTag::kPassed]; } - case WindowAveragePolicy::kAbsoluteSum: { + case WindowAveragePolicy::kSum: { return 1.0; } default: { diff --git a/test/src/population/fst_pool.cpp b/test/src/population/fst_pool.cpp index 42b72544..3731607e 100644 --- a/test/src/population/fst_pool.cpp +++ b/test/src/population/fst_pool.cpp @@ -402,7 +402,7 @@ TEST( FST, FstPoolProcessor ) // Make an FST processor for the two samples. std::vector const poolsizes{ 100, 100 }; auto processor = make_fst_pool_processor( - WindowAveragePolicy::kAbsoluteSum, poolsizes + WindowAveragePolicy::kSum, poolsizes ); ASSERT_EQ( 1, processor.size() ); processor.thread_pool( Options::get().global_thread_pool() ); From afe5765ad3d43a98c9276887f6d0eea8425ae550 Mon Sep 17 00:00:00 2001 From: Lucas Czech Date: Tue, 4 Jun 2024 10:51:22 +0200 Subject: [PATCH 2/4] Deactivate MacOS gcc CI test that fails due to GitHub Actions issues --- .github/workflows/ci.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index bb533243..de97a4eb 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -109,11 +109,14 @@ jobs: - os: ubuntu-22.04 compiler: gcc-8 - # Same on MacOS now... + # Older GCC versions are also not supported by MacOS 12 any more. + # Also excluding a broken current one for now... Fix later! - os: macos-12 compiler: gcc-7 - os: macos-12 compiler: gcc-8 + - os: macos-12 + compiler: gcc-13 # llvm-9 causes weird segfauls on Ubuntu, which do not seem to be our fault. # Need to investiage further at some point. We used setup-cpp locally to install From cdccf247bde48c8e7b4abeeb2c5504c33e24e58d Mon Sep 17 00:00:00 2001 From: Lucas Czech Date: Tue, 4 Jun 2024 13:47:45 +0200 Subject: [PATCH 3/4] Minor refinement in Variant Gapless Stream --- .../population/stream/variant_gapless_input_stream.cpp | 10 ++++++---- .../population/stream/variant_gapless_input_stream.hpp | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/genesis/population/stream/variant_gapless_input_stream.cpp b/lib/genesis/population/stream/variant_gapless_input_stream.cpp index 4c2f36cd..979aafc5 100644 --- a/lib/genesis/population/stream/variant_gapless_input_stream.cpp +++ b/lib/genesis/population/stream/variant_gapless_input_stream.cpp @@ -99,7 +99,7 @@ VariantGaplessInputStream::Iterator::Iterator( VariantGaplessInputStream* parent // If we are here, we have initialized the current locus to the first position on some valid // chromosome, and we can start the processing. assert( current_locus_.chromosome != "" && current_locus_.position != 0 ); - start_chromosome_(); + init_chromosome_(); // We have just initialized the chrosome, including all cache pointer for the given references. // We now use that to check that the position where we started is actually covered by the @@ -147,7 +147,7 @@ void VariantGaplessInputStream::Iterator::advance_() // If the next position is the start of a chromosome, // we need to set it up correctly first. if( current_locus_.position == 1 ) { - start_chromosome_(); + init_chromosome_(); } } while( ! current_locus_is_covered_by_genome_locus_set_() ); @@ -157,10 +157,10 @@ void VariantGaplessInputStream::Iterator::advance_() } // ------------------------------------------------------------------------- -// start_chromosome_ +// init_chromosome_ // ------------------------------------------------------------------------- -void VariantGaplessInputStream::Iterator::start_chromosome_() +void VariantGaplessInputStream::Iterator::init_chromosome_() { // Check that we are not done yet (parent still valid), and that we have either // a ref genome or a seq dict, but not both (neither is also fine). @@ -554,6 +554,8 @@ void VariantGaplessInputStream::Iterator::check_input_iterator_() bool VariantGaplessInputStream::Iterator::current_locus_is_covered_by_genome_locus_set_() { assert( parent_ ); + assert( current_locus_.chromosome != "" ); + assert( current_locus_.position > 0 ); // Without a given genome locus set, we always consider this position to be covered. if( ! parent_->genome_locus_set_ ) { diff --git a/lib/genesis/population/stream/variant_gapless_input_stream.hpp b/lib/genesis/population/stream/variant_gapless_input_stream.hpp index c4f3e7f0..8b998f7c 100644 --- a/lib/genesis/population/stream/variant_gapless_input_stream.hpp +++ b/lib/genesis/population/stream/variant_gapless_input_stream.hpp @@ -241,7 +241,7 @@ class VariantGaplessInputStream * It then does some consistenty checks and sets up the ref genome or seq dict for the new * chromosome. */ - void start_chromosome_(); + void init_chromosome_(); /** * @brief Find the next locus to process. From ddfb3fbb9fee5eab1ec95f273eaec9116ecda2db Mon Sep 17 00:00:00 2001 From: Lucas Czech Date: Tue, 4 Jun 2024 19:00:26 +0200 Subject: [PATCH 4/4] Redesign FST window averaging implementation --- .../population/function/fst_cathedral.cpp | 17 +- .../function/fst_pool_functions.hpp | 16 +- .../function/fst_pool_processor.hpp | 102 +++--- .../population/function/fst_pool_unbiased.hpp | 345 +++++++++++------- .../population/function/window_average.hpp | 9 +- test/src/population/fst_pool.cpp | 25 +- 6 files changed, 310 insertions(+), 204 deletions(-) diff --git a/lib/genesis/population/function/fst_cathedral.cpp b/lib/genesis/population/function/fst_cathedral.cpp index e32249c2..ab8f942d 100644 --- a/lib/genesis/population/function/fst_cathedral.cpp +++ b/lib/genesis/population/function/fst_cathedral.cpp @@ -30,6 +30,7 @@ #include "genesis/population/function/fst_cathedral.hpp" +#include "genesis/population/function/window_average.hpp" #include "genesis/utils/formats/json/document.hpp" #include @@ -183,23 +184,27 @@ void fill_fst_cathedral_records_from_processor_( // Bit hacky, but good enough for now. Then, store the results. assert( processor.size() == records.size() ); for( size_t i = 0; i < processor.size(); ++i ) { - auto& raw_calc = processor.calculators()[i]; - auto fst_calc = dynamic_cast( raw_calc.get() ); + auto const& raw_calc = processor.calculators()[i]; + auto const* fst_calc = dynamic_cast( raw_calc.get() ); if( ! fst_calc ) { throw std::runtime_error( "In compute_fst_cathedral_records_for_chromosome(): " "Invalid FstPoolCalculator that is not FstPoolCalculatorUnbiased" ); } + if( fst_calc->get_window_average_policy() != WindowAveragePolicy::kSum ) { + throw std::runtime_error( + "In compute_fst_cathedral_records_for_chromosome(): " + "Invalid FstPoolCalculator that is not using WindowAveragePolicy::kSum" + ); + } // Now add the entry for the current calculator to its respective records entry. // We rely on the amortized complexity here - cannot pre-allocate the size, // as we do not know how many positions will actually be in the input beforehand. + auto const pis = fst_calc->get_pi_values( 0, processor.get_filter_stats() ); records[i].entries.emplace_back( - position, - fst_calc->get_pi_within(), - fst_calc->get_pi_between(), - fst_calc->get_pi_total() + position, pis.pi_within, pis.pi_between, pis.pi_total ); } } diff --git a/lib/genesis/population/function/fst_pool_functions.hpp b/lib/genesis/population/function/fst_pool_functions.hpp index 82d78644..1e774288 100644 --- a/lib/genesis/population/function/fst_pool_functions.hpp +++ b/lib/genesis/population/function/fst_pool_functions.hpp @@ -31,11 +31,16 @@ * @ingroup population */ +#include "genesis/population/filter/filter_stats.hpp" +#include "genesis/population/filter/filter_status.hpp" +#include "genesis/population/filter/sample_counts_filter.hpp" +#include "genesis/population/filter/variant_filter.hpp" #include "genesis/population/function/fst_pool_karlsson.hpp" #include "genesis/population/function/fst_pool_kofler.hpp" -#include "genesis/population/function/fst_pool_unbiased.hpp" #include "genesis/population/function/fst_pool_processor.hpp" +#include "genesis/population/function/fst_pool_unbiased.hpp" #include "genesis/population/function/functions.hpp" +#include "genesis/population/function/window_average.hpp" #include "genesis/population/variant.hpp" #include "genesis/utils/containers/matrix.hpp" #include "genesis/utils/containers/transform_iterator.hpp" @@ -341,7 +346,10 @@ std::pair f_st_pool_unbiased( } // Init the calculator. - FstPoolCalculatorUnbiased calc{ p1_poolsize, p2_poolsize }; + // For simplicity in this wrapper, we only allow to normalize the pi values per window + // via their sum; in this function, we do not have the additionally needed information + // on the Variant Filter Status statistics anyway. If that is needed, use FstPoolProcessor. + FstPoolCalculatorUnbiased calc{ p1_poolsize, p2_poolsize, WindowAveragePolicy::kSum }; // Iterate the two ranges in parallel. Each iteration is one position in the genome. // In each iteration, p1_it and p2_it point at SampleCounts objects containing nucleotide counts. @@ -362,7 +370,9 @@ std::pair f_st_pool_unbiased( ); } - return calc.get_result_pair(); + // Unfortunately, we need dummies here now for the window based counters. They are not used + // with the above WindowAveragePolicy::kSum policy, but need to be provided nonetheless... + return calc.get_result_pair( 0, VariantFilterStats{} ); } #if __cplusplus >= 201402L diff --git a/lib/genesis/population/function/fst_pool_processor.hpp b/lib/genesis/population/function/fst_pool_processor.hpp index 8030f5ae..37e7a9d0 100644 --- a/lib/genesis/population/function/fst_pool_processor.hpp +++ b/lib/genesis/population/function/fst_pool_processor.hpp @@ -76,16 +76,6 @@ class FstPoolProcessor // Constructors and Rule of Five // ------------------------------------------------------------------------- - /** - * @brief Default constructor. - * - * We always want to make sure that the user provides a WindowAveragePolicy, so using this - * default constructor leads to an unusable instance. We provide it so that dummy processors - * can be constructed, but they have to be replaced by non-default-constructed instances - * befor usage. - */ - FstPoolProcessor() = default; - /** * @brief Construct a processor. * @@ -94,14 +84,11 @@ class FstPoolProcessor * is to be used, deactivate by explicitly setting the thread_pool() function here to `nullptr`. */ FstPoolProcessor( - WindowAveragePolicy window_average_policy, std::shared_ptr thread_pool = nullptr, size_t threading_threshold = 4096 ) - : avg_policy_( window_average_policy ) - , thread_pool_( thread_pool ) + : thread_pool_( thread_pool ) , threading_threshold_( threading_threshold ) - , is_default_constructed_( false ) { thread_pool_ = thread_pool ? thread_pool : utils::Options::get().global_thread_pool(); } @@ -189,6 +176,7 @@ class FstPoolProcessor void reset() { + assert( calculators_.size() == results_.size() ); for( auto& calc : calculators_ ) { assert( calc ); calc->reset(); @@ -201,6 +189,10 @@ class FstPoolProcessor // Also reset the pi vectors to nan. // If they are not allocated, nothing happens. + auto const res_sz = results_.size(); + assert( std::get<0>( results_pi_ ).size() == 0 || std::get<0>( results_pi_ ).size() == res_sz ); + assert( std::get<1>( results_pi_ ).size() == 0 || std::get<1>( results_pi_ ).size() == res_sz ); + assert( std::get<2>( results_pi_ ).size() == 0 || std::get<2>( results_pi_ ).size() == res_sz ); std::fill( std::get<0>( results_pi_ ).begin(), std::get<0>( results_pi_ ).end(), std::numeric_limits::quiet_NaN() @@ -219,9 +211,6 @@ class FstPoolProcessor { // Check correct usage assert( sample_pairs_.size() == calculators_.size() ); - if( is_default_constructed_ ) { - throw std::domain_error( "Cannot use a default constructed FstPoolProcessor" ); - } // Only process Variants that are passing, but keep track of the ones that did not. ++filter_stats_[variant.status.get()]; @@ -272,11 +261,19 @@ class FstPoolProcessor { assert( results_.size() == calculators_.size() ); for( size_t i = 0; i < results_.size(); ++i ) { - auto const abs_fst = calculators_[i]->get_result(); - auto const window_avg_denom = window_average_denominator( - avg_policy_, window_length, filter_stats_, calculators_[i]->get_filter_stats() - ); - results_[i] = abs_fst / window_avg_denom; + // We do an ugly dispatch here to treat the special case of the FstPoolCalculatorUnbiased + // class, which needs additional information on the window in order to normalize + // the pi values correctly. The Kofler and Karlsson do not need that, and we want to + // avoid using dummies in these places. So instead, we just do a dispatch here. + // If in the future more calculators are added that need special behaviour, + // we might want to redesign this... + auto const* raw_calc = calculators_[i].get(); + auto const* unbiased_calc = dynamic_cast( raw_calc ); + if( unbiased_calc ) { + results_[i] = unbiased_calc->get_result( window_length, filter_stats_ ); + } else { + results_[i] = raw_calc->get_result(); + } } return results_; } @@ -285,9 +282,9 @@ class FstPoolProcessor * @brief Get lists of all the three intermediate pi values (within, between, total) that * are part of our unbiased estimators. * - * This computes the window-averaged values for pi within, pi between, and pi total (in that - * order in the tuple), for each sample pair (order in the three vectors). This uses the same - * window averaging policy as the get_result() function. + * This computes the window-average-corrected values for pi within, pi between, and pi total + * (in that order in the tuple), for each sample pair (order in the three vectors). + * This uses the same window averaging policy as the get_result() function. * * This only works when all calculators are of type FstPoolCalculatorUnbiased, and throws an * exception otherwise. It is merely meant as a convenience function for that particular case. @@ -296,31 +293,32 @@ class FstPoolProcessor { // Only allocate when someone first calls this. // Does not do anything afterwards. - auto const result_size = calculators_.size(); - std::get<0>( results_pi_ ).resize( result_size ); - std::get<1>( results_pi_ ).resize( result_size ); - std::get<2>( results_pi_ ).resize( result_size ); + auto const res_sz = calculators_.size(); + assert( std::get<0>( results_pi_ ).size() == 0 || std::get<0>( results_pi_ ).size() == res_sz ); + assert( std::get<1>( results_pi_ ).size() == 0 || std::get<1>( results_pi_ ).size() == res_sz ); + assert( std::get<2>( results_pi_ ).size() == 0 || std::get<2>( results_pi_ ).size() == res_sz ); + std::get<0>( results_pi_ ).resize( res_sz ); + std::get<1>( results_pi_ ).resize( res_sz ); + std::get<2>( results_pi_ ).resize( res_sz ); // Get the pi values from all calculators, assuming that they are of the correct type. - for( size_t i = 0; i < result_size; ++i ) { - auto& raw_calc = calculators_[i]; - auto cast_calc = dynamic_cast( raw_calc.get() ); - if( ! cast_calc ) { + for( size_t i = 0; i < res_sz; ++i ) { + auto const& raw_calc = calculators_[i]; + auto const* unbiased_calc = dynamic_cast( raw_calc.get() ); + if( ! unbiased_calc ) { throw std::domain_error( "Can only call FstPoolProcessor::get_pi_vectors() " "for calculators of type FstPoolCalculatorUnbiased." ); } - // Get the denominator to use for all averaging. - auto const window_avg_denom = window_average_denominator( - avg_policy_, window_length, filter_stats_, calculators_[i]->get_filter_stats() - ); - - // We compute the window-averaged values as above. - std::get<0>(results_pi_)[i] = cast_calc->get_pi_within() / window_avg_denom; - std::get<1>(results_pi_)[i] = cast_calc->get_pi_between() / window_avg_denom; - std::get<2>(results_pi_)[i] = cast_calc->get_pi_total() / window_avg_denom; + // We compute the window-averaged values here. + // Unfortunately, we need to copy this value-by-value, as we want to return + // three independent vectors for user convenienec on the caller's end. + auto const pis = unbiased_calc->get_pi_values( window_length, filter_stats_ ); + std::get<0>(results_pi_)[i] = pis.pi_within; + std::get<1>(results_pi_)[i] = pis.pi_between; + std::get<2>(results_pi_)[i] = pis.pi_total; } return results_pi_; @@ -354,10 +352,6 @@ class FstPoolProcessor private: - // We force the correct usage of the window averaging policy here, - // so that we make misinterpretation of the values less likely. - WindowAveragePolicy avg_policy_; - // The pairs of sample indices of the variant between which we want to compute FST, // and the processors to use for these computations, std::vector> sample_pairs_; @@ -376,10 +370,6 @@ class FstPoolProcessor // (number of sample pairs) at which we start using the thread pool. std::shared_ptr thread_pool_; size_t threading_threshold_ = 0; - - // We want to make sure to disallow default constructed instances. - // Bit ugly to do it this way, but works for now. - bool is_default_constructed_ = true; }; // ================================================================================================= @@ -396,11 +386,10 @@ class FstPoolProcessor */ template inline FstPoolProcessor make_fst_pool_processor( - WindowAveragePolicy window_average_policy, std::vector const& pool_sizes, Args... args ) { - FstPoolProcessor result( window_average_policy ); + FstPoolProcessor result; for( size_t i = 0; i < pool_sizes.size(); ++i ) { for( size_t j = i + 1; j < pool_sizes.size(); ++j ) { result.add_calculator( @@ -427,12 +416,11 @@ inline FstPoolProcessor make_fst_pool_processor( */ template inline FstPoolProcessor make_fst_pool_processor( - WindowAveragePolicy window_average_policy, std::vector> const& sample_pairs, std::vector const& pool_sizes, Args... args ) { - FstPoolProcessor result( window_average_policy ); + FstPoolProcessor result; for( auto const& p : sample_pairs ) { if( p.first >= pool_sizes.size() || p.second >= pool_sizes.size() ) { throw std::invalid_argument( @@ -465,12 +453,11 @@ inline FstPoolProcessor make_fst_pool_processor( */ template inline FstPoolProcessor make_fst_pool_processor( - WindowAveragePolicy window_average_policy, size_t index, std::vector const& pool_sizes, Args... args ) { - FstPoolProcessor result( window_average_policy ); + FstPoolProcessor result; for( size_t i = 0; i < pool_sizes.size(); ++i ) { result.add_calculator( index, i, @@ -495,12 +482,11 @@ inline FstPoolProcessor make_fst_pool_processor( */ template inline FstPoolProcessor make_fst_pool_processor( - WindowAveragePolicy window_average_policy, size_t index_1, size_t index_2, std::vector const& pool_sizes, Args... args ) { - FstPoolProcessor result( window_average_policy ); + FstPoolProcessor result; result.add_calculator( index_1, index_2, ::genesis::utils::make_unique( diff --git a/lib/genesis/population/function/fst_pool_unbiased.hpp b/lib/genesis/population/function/fst_pool_unbiased.hpp index ced04b05..79f1ed2f 100644 --- a/lib/genesis/population/function/fst_pool_unbiased.hpp +++ b/lib/genesis/population/function/fst_pool_unbiased.hpp @@ -31,8 +31,13 @@ * @ingroup population */ +#include "genesis/population/filter/filter_stats.hpp" +#include "genesis/population/filter/filter_status.hpp" +#include "genesis/population/filter/sample_counts_filter.hpp" +#include "genesis/population/filter/variant_filter.hpp" #include "genesis/population/function/fst_pool_calculator.hpp" #include "genesis/population/function/functions.hpp" +#include "genesis/population/function/window_average.hpp" #include "genesis/utils/core/std.hpp" #include "genesis/utils/math/common.hpp" #include "genesis/utils/math/compensated_sum.hpp" @@ -76,7 +81,7 @@ class FstPoolCalculatorUnbiased final : public BaseFstPoolCalculator public: // ------------------------------------------------------------------------- - // Constructors and Rule of Five + // Typedefs and Enums // ------------------------------------------------------------------------- enum class Estimator @@ -85,9 +90,26 @@ class FstPoolCalculatorUnbiased final : public BaseFstPoolCalculator kHudson }; - FstPoolCalculatorUnbiased( size_t p1_poolsize, size_t p2_poolsize, Estimator est = Estimator::kNei ) - : p1_poolsize_( p1_poolsize ) - , p2_poolsize_( p2_poolsize ) + struct PiValues + { + double pi_within; + double pi_between; + double pi_total; + }; + + // ------------------------------------------------------------------------- + // Constructors and Rule of Five + // ------------------------------------------------------------------------- + + FstPoolCalculatorUnbiased( + size_t smp1_poolsize, + size_t smp2_poolsize, + WindowAveragePolicy window_average_policy, + Estimator est = Estimator::kNei + ) + : smp1_poolsize_( smp1_poolsize ) + , smp2_poolsize_( smp2_poolsize ) + , avg_policy_( window_average_policy ) , estimator_( est ) {} @@ -100,88 +122,142 @@ class FstPoolCalculatorUnbiased final : public BaseFstPoolCalculator FstPoolCalculatorUnbiased& operator= ( FstPoolCalculatorUnbiased&& ) = default; // ------------------------------------------------------------------------- - // Additional Members + // Abstract Members // ------------------------------------------------------------------------- - /** - * @brief Get both variants of FST, following Nei, and following Hudson, as a pair. - */ - std::pair get_result_pair() const - { - // Final computation of our two FST estimators, using Nei and Hudson, respectively. - double const fst_nei = 1.0 - ( pi_w_sum_ / pi_t_sum_ ); - double const fst_hud = 1.0 - ( pi_w_sum_ / pi_b_sum_ ); - return { fst_nei, fst_hud }; - } +private: - double get_pi_within() const + void reset_() override { - return pi_w_sum_; + // Reset the internal counters, but not the pool sizes, + // so that the instance can be reused across windows. + sample_filter_stats_smp1_.clear(); + sample_filter_stats_smp2_.clear(); + sample_filter_stats_b_.clear(); + pi_w_smp1_sum_ = 0.0; + pi_w_smp2_sum_ = 0.0; + pi_b_sum_ = 0.0; } - double get_pi_between() const + void process_( SampleCounts const& smp1, SampleCounts const& smp2 ) override { - return pi_b_sum_; - } + // Helper function for the two components of pi within + auto pi_within_partial_ = []( + double poolsize, double freq_a, double freq_c, double freq_g, double freq_t, double nt_cnt + ) { + assert( poolsize > 1.0 ); - double get_pi_total() const - { - return pi_t_sum_; - } + double result = 1.0; + result -= utils::squared( freq_a ); + result -= utils::squared( freq_c ); + result -= utils::squared( freq_g ); + result -= utils::squared( freq_t ); + result *= nt_cnt / ( nt_cnt - 1.0 ); + result *= poolsize / ( poolsize - 1.0 ); + return result; + }; - // ------------------------------------------------------------------------- - // Abstract Members - // ------------------------------------------------------------------------- + // Prepare frequencies in sample 1. We only compute them when used. + double smp1_nt_cnt = 0.0; + double smp1_freq_a = 0.0; + double smp1_freq_c = 0.0; + double smp1_freq_g = 0.0; + double smp1_freq_t = 0.0; + + // Prepare frequencies in sample 2. We only compute them when used. + double smp2_nt_cnt = 0.0; + double smp2_freq_a = 0.0; + double smp2_freq_c = 0.0; + double smp2_freq_g = 0.0; + double smp2_freq_t = 0.0; + + // Compute pi within for smp1 + ++sample_filter_stats_smp1_[ smp1.status.get() ]; + if( smp1.status.passing() ) { + smp1_nt_cnt = static_cast( nucleotide_sum( smp1 )); + smp1_freq_a = static_cast( smp1.a_count ) / smp1_nt_cnt; + smp1_freq_c = static_cast( smp1.c_count ) / smp1_nt_cnt; + smp1_freq_g = static_cast( smp1.g_count ) / smp1_nt_cnt; + smp1_freq_t = static_cast( smp1.t_count ) / smp1_nt_cnt; + + auto const pi_within_smp1 = pi_within_partial_( + smp1_poolsize_, smp1_freq_a, smp1_freq_c, smp1_freq_g, smp1_freq_t, smp1_nt_cnt + ); + if( std::isfinite( pi_within_smp1 )) { + pi_w_smp1_sum_ += pi_within_smp1; + assert( smp1.a_count + smp1.c_count + smp1.g_count + smp1.t_count > 0 ); + } else { + assert( smp1.a_count + smp1.c_count + smp1.g_count + smp1.t_count <= 1 ); + } + } -private: + // Compute pi within for smp2 + ++sample_filter_stats_smp2_[ smp2.status.get() ]; + if( smp2.status.passing() ) { + smp2_nt_cnt = static_cast( nucleotide_sum( smp2 )); + smp2_freq_a = static_cast( smp2.a_count ) / smp2_nt_cnt; + smp2_freq_c = static_cast( smp2.c_count ) / smp2_nt_cnt; + smp2_freq_g = static_cast( smp2.g_count ) / smp2_nt_cnt; + smp2_freq_t = static_cast( smp2.t_count ) / smp2_nt_cnt; + + auto const pi_within_smp2 = pi_within_partial_( + smp2_poolsize_, smp2_freq_a, smp2_freq_c, smp2_freq_g, smp2_freq_t, smp2_nt_cnt + ); + if( std::isfinite( pi_within_smp2 )) { + pi_w_smp2_sum_ += pi_within_smp2; + assert( smp2.a_count + smp2.c_count + smp2.g_count + smp2.t_count > 0 ); + } else { + assert( smp2.a_count + smp2.c_count + smp2.g_count + smp2.t_count <= 1 ); + } + } - void reset_() override - { - // Reset the internal counters, but not the pool sizes, so that the object can be reused. - pi_w_sum_ = 0.0; - pi_b_sum_ = 0.0; - pi_t_sum_ = 0.0; - } + // Compute pi between + if( smp1.status.passing() && smp2.status.passing() ) { + double pi_between = 1.0; + pi_between -= smp1_freq_a * smp2_freq_a; + pi_between -= smp1_freq_c * smp2_freq_c; + pi_between -= smp1_freq_g * smp2_freq_g; + pi_between -= smp1_freq_t * smp2_freq_t; + if( std::isfinite( pi_between )) { + pi_b_sum_ += pi_between; + } - void process_( SampleCounts const& p1, SampleCounts const& p2 ) override - { - // Compute pi values for the snp. - // The tuple `pi_snp` returns pi within, between, and total, in that order. - auto const pi_snp = f_st_pool_unbiased_pi_snp( p1_poolsize_, p2_poolsize_, p1, p2 ); - - // Skip invalid entries than can happen when less than two of [ACGT] have - // counts > 0 in one of the SampleCounts samples. - if( - std::isfinite( std::get<0>( pi_snp )) && - std::isfinite( std::get<1>( pi_snp )) && - std::isfinite( std::get<2>( pi_snp )) - ) { - // If we are here, both p1 and p2 have counts. Let's assert. - assert( p1.a_count + p1.c_count + p1.g_count + p1.t_count > 0 ); - assert( p2.a_count + p2.c_count + p2.g_count + p2.t_count > 0 ); - - // Now add them to the tally. - pi_w_sum_ += std::get<0>( pi_snp ); - pi_b_sum_ += std::get<1>( pi_snp ); - pi_t_sum_ += std::get<2>( pi_snp ); + ++sample_filter_stats_b_[SampleCountsFilterTag::kPassed]; + } else if( ! smp1.status.passing() && ! smp2.status.passing() ) { + ++sample_filter_stats_b_[ std::min( smp1.status.get(), smp2.status.get() )]; } else { - // If we are here, at least one of the two inputs has one or fewer counts in [ACGT], - // otherwise, the results would have been finite. Let's assert this. - assert( - ( p1.a_count + p1.c_count + p1.g_count + p1.t_count <= 1 ) || - ( p2.a_count + p2.c_count + p2.g_count + p2.t_count <= 1 ) - ); + ++sample_filter_stats_b_[ std::max( smp1.status.get(), smp2.status.get() )]; } } double get_result_() const override { + // We have a bit of an unfortunate situation here from a software design point of view. + // The other fst calculator classes (Kofler and Karlsson, at the moment) do not use the + // window normalization, and it would be weird to use dummies there. So instead, in the + // FstPoolProcessor, we cast accordingly (which is ugly as well), to avoid this. + // We need a dummy implementation of the result function here, to make the compiler happy. + throw std::domain_error( + "FstPoolCalculatorUnbiased::get_result() needs to be called via its overload" + ); + } + + // ------------------------------------------------------------------------- + // Additional Members + // ------------------------------------------------------------------------- + +public: + + double get_result( + size_t window_length, + VariantFilterStats const& variant_filter_stats + ) const { switch( estimator_ ) { case Estimator::kNei: { - return get_result_pair().first; + return get_result_pair( window_length, variant_filter_stats ).first; } case Estimator::kHudson: { - return get_result_pair().second; + return get_result_pair( window_length, variant_filter_stats ).second; } default: { throw std::invalid_argument( "Invalid FstPoolCalculatorUnbiased::Estimator" ); @@ -190,74 +266,74 @@ class FstPoolCalculatorUnbiased final : public BaseFstPoolCalculator return 0.0; } - // ------------------------------------------------------------------------- - // Helper Functions - // ------------------------------------------------------------------------- - -public: - - /** - * @brief Compute the SNP-based Theta Pi values used in f_st_pool_unbiased(). - * - * The function returns pi within, between, and total, in that order. - * See f_st_pool_unbiased() for details. - */ - static std::tuple f_st_pool_unbiased_pi_snp( - size_t p1_poolsize, size_t p2_poolsize, - SampleCounts const& p1_counts, SampleCounts const& p2_counts - ) { - using namespace genesis::utils; + double get_pi_within( + size_t window_length, + VariantFilterStats const& variant_filter_stats + ) const { + auto const pi_w_smp1 = pi_w_smp1_sum_.get() / window_average_denominator( + avg_policy_, window_length, variant_filter_stats, sample_filter_stats_smp1_ + ); + auto const pi_w_smp2 = pi_w_smp2_sum_.get() / window_average_denominator( + avg_policy_, window_length, variant_filter_stats, sample_filter_stats_smp2_ + ); + return 0.5 * ( pi_w_smp1 + pi_w_smp2 ); + } - // There is some code duplication from f_st_pool_kofler_pi_snp() here, but for speed reasons, - // we keep it that way for now. Not worth calling more functions than needed. + double get_pi_between( + size_t window_length, + VariantFilterStats const& variant_filter_stats + ) const { + return pi_b_sum_.get() / window_average_denominator( + avg_policy_, window_length, variant_filter_stats, sample_filter_stats_b_ + ); + } - // Helper function for the two components of pi within - auto pi_within_partial_ = []( - double poolsize, double freq_a, double freq_c, double freq_g, double freq_t, double nt_cnt - ) { - assert( poolsize > 1.0 ); + double get_pi_total( double pi_within, double pi_between ) const + { + return 0.5 * ( pi_within + pi_between ); + } - double result = 1.0; - result -= squared( freq_a ); - result -= squared( freq_c ); - result -= squared( freq_g ); - result -= squared( freq_t ); - result *= nt_cnt / ( nt_cnt - 1.0 ); - result *= poolsize / ( poolsize - 1.0 ); - return result; - }; + double get_pi_total( + size_t window_length, + VariantFilterStats const& variant_filter_stats + ) const { + auto const pi_within = get_pi_within( window_length, variant_filter_stats ); + auto const pi_between = get_pi_between( window_length, variant_filter_stats ); + return get_pi_total( pi_within, pi_between ); + } - // Get frequencies in sample 1 - double const p1_nt_cnt = static_cast( nucleotide_sum( p1_counts )); - double const p1_freq_a = static_cast( p1_counts.a_count ) / p1_nt_cnt; - double const p1_freq_c = static_cast( p1_counts.c_count ) / p1_nt_cnt; - double const p1_freq_g = static_cast( p1_counts.g_count ) / p1_nt_cnt; - double const p1_freq_t = static_cast( p1_counts.t_count ) / p1_nt_cnt; - - // Get frequencies in sample 2 - double const p2_nt_cnt = static_cast( nucleotide_sum( p2_counts )); - double const p2_freq_a = static_cast( p2_counts.a_count ) / p2_nt_cnt; - double const p2_freq_c = static_cast( p2_counts.c_count ) / p2_nt_cnt; - double const p2_freq_g = static_cast( p2_counts.g_count ) / p2_nt_cnt; - double const p2_freq_t = static_cast( p2_counts.t_count ) / p2_nt_cnt; - - // Compute pi within - auto const pi_within = 0.5 * ( - pi_within_partial_( p1_poolsize, p1_freq_a, p1_freq_c, p1_freq_g, p1_freq_t, p1_nt_cnt ) + - pi_within_partial_( p2_poolsize, p2_freq_a, p2_freq_c, p2_freq_g, p2_freq_t, p2_nt_cnt ) - ); + PiValues get_pi_values( + size_t window_length, + VariantFilterStats const& variant_filter_stats + ) const { + PiValues result; + result.pi_within = get_pi_within( window_length, variant_filter_stats ); + result.pi_between = get_pi_between( window_length, variant_filter_stats ); + result.pi_total = get_pi_total( result.pi_within, result.pi_between ); + return result; + } - // Compute pi between - double pi_between = 1.0; - pi_between -= p1_freq_a * p2_freq_a; - pi_between -= p1_freq_c * p2_freq_c; - pi_between -= p1_freq_g * p2_freq_g; - pi_between -= p1_freq_t * p2_freq_t; + /** + * @brief Get both variants of FST, following Nei, and following Hudson, as a pair. + */ + std::pair get_result_pair( + size_t window_length, + VariantFilterStats const& variant_filter_stats + ) const { + // Get the components that we need, each normalized using their own filter stats. + auto const pi_within = get_pi_within( window_length, variant_filter_stats ); + auto const pi_between = get_pi_between( window_length, variant_filter_stats ); + auto const pi_total = get_pi_total( pi_within, pi_between ); - // Compute pi total - double const pi_total = 0.5 * ( pi_within + pi_between ); + // Final computation of our two FST estimators, using Nei and Hudson, respectively. + double const fst_nei = 1.0 - ( pi_within / pi_total ); + double const fst_hud = 1.0 - ( pi_within / pi_between ); + return { fst_nei, fst_hud }; + } - return std::tuple{ pi_within, pi_between, pi_total }; + WindowAveragePolicy get_window_average_policy() const + { + return avg_policy_; } // ------------------------------------------------------------------------- @@ -267,15 +343,22 @@ class FstPoolCalculatorUnbiased final : public BaseFstPoolCalculator private: // Pool sizes - size_t p1_poolsize_ = 0; - size_t p2_poolsize_ = 0; + size_t smp1_poolsize_ = 0; + size_t smp2_poolsize_ = 0; + // Parameters + WindowAveragePolicy avg_policy_; Estimator estimator_ = Estimator::kNei; - // Sums over the window of pi within, between, total. - utils::NeumaierSum pi_w_sum_ = 0.0; - utils::NeumaierSum pi_b_sum_ = 0.0; - utils::NeumaierSum pi_t_sum_ = 0.0; + // Filter stats of everything that is processed here. + SampleCountsFilterStats sample_filter_stats_smp1_; + SampleCountsFilterStats sample_filter_stats_smp2_; + SampleCountsFilterStats sample_filter_stats_b_; + + // Sums over the window of pi within for both pools, and pi between them. + utils::NeumaierSum pi_w_smp1_sum_ = 0.0; + utils::NeumaierSum pi_w_smp2_sum_ = 0.0; + utils::NeumaierSum pi_b_sum_ = 0.0; }; diff --git a/lib/genesis/population/function/window_average.hpp b/lib/genesis/population/function/window_average.hpp index 6144ef8e..b332286b 100644 --- a/lib/genesis/population/function/window_average.hpp +++ b/lib/genesis/population/function/window_average.hpp @@ -142,9 +142,14 @@ inline double window_average_denominator( // in the stats, so this works there as well. We do not simply assert this here, // as a misuse of the filters could result in an error here, which would be on the user // side, and so is an exception. - if( sample_counts_filter_stats.sum() > variant_filter_stats[VariantFilterTag::kPassed] ) { + // We skip this test when using the sum anyway, as in those cases, we might not have + // the correct filter stats available in the first place. + if( + policy != WindowAveragePolicy::kSum && + sample_counts_filter_stats.sum() > variant_filter_stats[VariantFilterTag::kPassed] + ) { throw std::invalid_argument( - "Invalid stat counts with ""sample_counts_filter_stats.sum() > " + "Invalid stat counts with sample_counts_filter_stats.sum() > " "variant_filter_stats[VariantFilterTag::kPassed]" ); } diff --git a/test/src/population/fst_pool.cpp b/test/src/population/fst_pool.cpp index 3731607e..8aecb05b 100644 --- a/test/src/population/fst_pool.cpp +++ b/test/src/population/fst_pool.cpp @@ -402,7 +402,7 @@ TEST( FST, FstPoolProcessor ) // Make an FST processor for the two samples. std::vector const poolsizes{ 100, 100 }; auto processor = make_fst_pool_processor( - WindowAveragePolicy::kSum, poolsizes + poolsizes, WindowAveragePolicy::kSum ); ASSERT_EQ( 1, processor.size() ); processor.thread_pool( Options::get().global_thread_pool() ); @@ -417,7 +417,25 @@ TEST( FST, FstPoolProcessor ) size_t const window_length = 1; auto const result = processor.get_result( window_length ); EXPECT_EQ( 1, result.size() ); - EXPECT_FLOAT_EQ( -0.0041116024, result[0] ); + + // Th FST value changed here since the introduction of the proper window normalization, + // since pi total is computed at the end of each window now, instead of being summed + // during the window. This is a "ratio of averages" vs "average of ratios" difference, + // which now changes the values of our result... + // EXPECT_FLOAT_EQ( -0.0041116024, result[0] ); + EXPECT_FLOAT_EQ( -0.00035794353, result[0] ); + + // Also test the involved pi values for consistency. + // The commented value below is the old pi total value before the redesign. + auto const vecs = processor.get_pi_vectors( window_length ); + EXPECT_EQ( 1, std::get<0>( vecs ).size() ); + EXPECT_EQ( 1, std::get<1>( vecs ).size() ); + EXPECT_EQ( 1, std::get<2>( vecs ).size() ); + EXPECT_FLOAT_EQ( 133.798915747651, std::get<0>( vecs )[0] ); + EXPECT_FLOAT_EQ( 133.703165107056, std::get<1>( vecs )[0] ); + EXPECT_FLOAT_EQ( 133.751040427354, std::get<2>( vecs )[0] ); + // EXPECT_FLOAT_EQ( 133.251040427354, std::get<2>( vecs )[0] ); + } // ================================================================================================= @@ -437,7 +455,7 @@ void test_fst_fuzzy_run_( std::vector const& data ) permuted_congruential_generator( 0, 4 ) ); auto processor = make_fst_pool_processor( - window_average_policy, pool_sizes + pool_sizes, window_average_policy ); // Run the data @@ -450,7 +468,6 @@ void test_fst_fuzzy_run_( std::vector const& data ) size_t const window_length = data.size(); auto const result = processor.get_result( window_length ); EXPECT_EQ( pairs, result.size() ); - // EXPECT_FLOAT_EQ( -0.0041116024, result[0] ); for( auto const& r : result ) { LOG_DBG << r; }