Skip to content

Commit

Permalink
Add provided loci mask for window averaging
Browse files Browse the repository at this point in the history
  • Loading branch information
lczech committed Jul 31, 2024
1 parent b0dac5f commit f7c74c4
Show file tree
Hide file tree
Showing 8 changed files with 338 additions and 66 deletions.
36 changes: 29 additions & 7 deletions lib/genesis/population/function/diversity_pool_processor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,12 @@
* @ingroup population
*/

#include "genesis/population/filter/variant_filter.hpp"
#include "genesis/population/function/diversity_pool_calculator.hpp"
#include "genesis/population/function/window_average.hpp"
#include "genesis/population/filter/variant_filter.hpp"
#include "genesis/population/genome_locus_set.hpp"
#include "genesis/population/variant.hpp"
#include "genesis/population/window/base_window.hpp"
#include "genesis/utils/core/options.hpp"
#include "genesis/utils/core/std.hpp"
#include "genesis/utils/threading/thread_functions.hpp"
Expand Down Expand Up @@ -240,22 +242,42 @@ class DiversityPoolProcessor
/**
* @brief Get a list of all resulting values for all samples.
*
* This _always_ takes the @p window_length as input, even if the WindowAveragePolicy does
* not require it. This is meant to make sure that we at least keep track of the right things
* when doing any computations, and cannot forget about this.
* This _always_ takes the @p window and @p provided_loci as input, even if the
* WindowAveragePolicy does not require it. This is meant to make sure that we at least keep
* track of the right things when doing any computations, and cannot forget about this.
* For cases where the result is needed without averaging over windows (that is, just the sum
* of all per site values), there is an overload of this function.
*/
std::vector<DiversityPoolCalculator::Result> const& get_result( size_t window_length ) const
{
template<class D>
std::vector<DiversityPoolCalculator::Result> const& get_result(
BaseWindow<D> const& window,
std::shared_ptr<GenomeLocusSet> provided_loci
) const {
assert( results_.size() == calculators_.size() );
for( size_t i = 0; i < results_.size(); ++i ) {
auto const window_avg_denom = window_average_denominator(
avg_policy_, window_length, filter_stats_, calculators_[i].get_filter_stats()
avg_policy_, window, provided_loci, filter_stats_, calculators_[i].get_filter_stats()
);
results_[i] = calculators_[i].get_result( window_avg_denom );
}
return results_;
}

/**
* @brief Get a list of all resulting values for all samples.
*
* This overload does not consider the window averaging, and simply returns the sum of all
* per site values.
*/
std::vector<DiversityPoolCalculator::Result> const& get_result() const
{
assert( results_.size() == calculators_.size() );
for( size_t i = 0; i < results_.size(); ++i ) {
results_[i] = calculators_[i].get_result( 1 );
}
return results_;
}

/**
* @brief Get the sum of filter statistics of all Variant%s processed here.
*
Expand Down
9 changes: 8 additions & 1 deletion lib/genesis/population/function/fst_cathedral.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,11 @@ void fill_fst_cathedral_records_from_processor_(
"Invalid FstPoolCalculator that is not FstPoolCalculatorUnbiased"
);
}

// We are only using the sum at the moment, to keep it simple.
// The below function to get the pi values is a shortcut without window averaging,
// which always returns the sum anyway, so the policy here is ignored either way.
// But we check it, in order to make sure the user set up everything as intended.
if( fst_calc->get_window_average_policy() != WindowAveragePolicy::kSum ) {
throw std::runtime_error(
"In compute_fst_cathedral_records_for_chromosome(): "
Expand All @@ -202,7 +207,9 @@ void fill_fst_cathedral_records_from_processor_(
// Now add the entry for the current calculator to its respective records entry.
// We rely on the amortized complexity here - cannot pre-allocate the size,
// as we do not know how many positions will actually be in the input beforehand.
auto const pis = fst_calc->get_pi_values( 0, processor.get_filter_stats() );
// For the window averaging, we do not need the window details or mask,
// so we use the overload here that does no window averaging.
auto const pis = fst_calc->get_pi_values();
records[i].entries.emplace_back(
position, pis.pi_within, pis.pi_between, pis.pi_total
);
Expand Down
5 changes: 2 additions & 3 deletions lib/genesis/population/function/fst_pool_functions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -370,9 +370,8 @@ std::pair<double, double> f_st_pool_unbiased(
);
}

// Unfortunately, we need dummies here now for the window based counters. They are not used
// with the above WindowAveragePolicy::kSum policy, but need to be provided nonetheless...
return calc.get_result_pair( 0, VariantFilterStats{} );
// We use the result overload here that does not do window averaging, and just returns the sum.
return calc.get_result_pair();
}

#if __cplusplus >= 201402L
Expand Down
101 changes: 83 additions & 18 deletions lib/genesis/population/function/fst_pool_processor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,13 @@
* @ingroup population
*/

#include "genesis/population/filter/variant_filter.hpp"
#include "genesis/population/function/fst_pool_calculator.hpp"
#include "genesis/population/function/fst_pool_unbiased.hpp"
#include "genesis/population/function/window_average.hpp"
#include "genesis/population/filter/variant_filter.hpp"
#include "genesis/population/genome_locus_set.hpp"
#include "genesis/population/variant.hpp"
#include "genesis/population/window/base_window.hpp"
#include "genesis/utils/core/options.hpp"
#include "genesis/utils/core/std.hpp"
#include "genesis/utils/threading/thread_functions.hpp"
Expand Down Expand Up @@ -254,12 +256,17 @@ class FstPoolProcessor
/**
* @brief Get a list of all resulting FST values for all pairs of samples.
*
* This _always_ takes the @p window_length as input, even if the WindowAveragePolicy does
* This _always_ takes the @p window and @p provided_loci as input, which are components needed
* for some of the window averaging policies, even if the WindowAveragePolicy does
* not require it. This is meant to make sure that we at least keep track of the right things
* when doing any computations, and cannot forget about this.
* There is an overload of this function which does not need this, and always returns the sum.
*/
std::vector<double> const& get_result( size_t window_length ) const
{
template<class D>
std::vector<double> const& get_result(
BaseWindow<D> const& window,
std::shared_ptr<GenomeLocusSet> provided_loci
) const {
assert( results_.size() == calculators_.size() );
for( size_t i = 0; i < results_.size(); ++i ) {
// We do an ugly dispatch here to treat the special case of the FstPoolCalculatorUnbiased
Expand All @@ -271,14 +278,30 @@ class FstPoolProcessor
auto const* raw_calc = calculators_[i].get();
auto const* unbiased_calc = dynamic_cast<FstPoolCalculatorUnbiased const*>( raw_calc );
if( unbiased_calc ) {
results_[i] = unbiased_calc->get_result( window_length, filter_stats_ );
results_[i] = unbiased_calc->get_result( window, provided_loci, filter_stats_ );
} else {
results_[i] = raw_calc->get_result();
}
}
return results_;
}

/**
* @brief Get a list of all resulting FST values for all pairs of samples.
*
* This overload does not use window averaging, and always returns the sum.
*/
std::vector<double> const& get_result() const
{
assert( results_.size() == calculators_.size() );
for( size_t i = 0; i < results_.size(); ++i ) {
// No dispatch here as in the above overload. Instead, we just use the result function
// that does not use window averaging directly.
results_[i] = calculators_[i]->get_result();
}
return results_;
}

/**
* @brief Get lists of all the three intermediate pi values (within, between, total) that
* are part of our unbiased estimators.
Expand All @@ -290,20 +313,15 @@ class FstPoolProcessor
* This only works when all calculators are of type FstPoolCalculatorUnbiased, and throws an
* exception otherwise. It is merely meant as a convenience function for that particular case.
*/
PiVectorTuple const& get_pi_vectors( size_t window_length ) const
{
// Only allocate when someone first calls this.
// Does not do anything afterwards.
auto const res_sz = calculators_.size();
assert( std::get<0>( results_pi_ ).size() == 0 || std::get<0>( results_pi_ ).size() == res_sz );
assert( std::get<1>( results_pi_ ).size() == 0 || std::get<1>( results_pi_ ).size() == res_sz );
assert( std::get<2>( results_pi_ ).size() == 0 || std::get<2>( results_pi_ ).size() == res_sz );
std::get<0>( results_pi_ ).resize( res_sz );
std::get<1>( results_pi_ ).resize( res_sz );
std::get<2>( results_pi_ ).resize( res_sz );
template<class D>
PiVectorTuple const& get_pi_vectors(
BaseWindow<D> const& window,
std::shared_ptr<GenomeLocusSet> provided_loci
) const {
allocate_pi_result_vectors_();

// Get the pi values from all calculators, assuming that they are of the correct type.
for( size_t i = 0; i < res_sz; ++i ) {
for( size_t i = 0; i < calculators_.size(); ++i ) {
auto const& raw_calc = calculators_[i];
auto const* unbiased_calc = dynamic_cast<FstPoolCalculatorUnbiased const*>( raw_calc.get() );
if( ! unbiased_calc ) {
Expand All @@ -316,7 +334,35 @@ class FstPoolProcessor
// We compute the window-averaged values here.
// Unfortunately, we need to copy this value-by-value, as we want to return
// three independent vectors for user convenienec on the caller's end.
auto const pis = unbiased_calc->get_pi_values( window_length, filter_stats_ );
auto const pis = unbiased_calc->get_pi_values( window, provided_loci, filter_stats_ );
std::get<0>(results_pi_)[i] = pis.pi_within;
std::get<1>(results_pi_)[i] = pis.pi_between;
std::get<2>(results_pi_)[i] = pis.pi_total;
}

return results_pi_;
}

/**
* @brief Get lists of all the three intermediate pi values (within, between, total) that
* are part of our unbiased estimators.
*
* This overload ignores the window average policy, and just returns the sum.
*/
PiVectorTuple const& get_pi_vectors() const
{
// Same as above, but using a different get_pi_vectors() overload.
allocate_pi_result_vectors_();
for( size_t i = 0; i < calculators_.size(); ++i ) {
auto const& raw_calc = calculators_[i];
auto const* unbiased_calc = dynamic_cast<FstPoolCalculatorUnbiased const*>( raw_calc.get() );
if( ! unbiased_calc ) {
throw std::domain_error(
"Can only call FstPoolProcessor::get_pi_vectors() "
"for calculators of type FstPoolCalculatorUnbiased."
);
}
auto const pis = unbiased_calc->get_pi_values();
std::get<0>(results_pi_)[i] = pis.pi_within;
std::get<1>(results_pi_)[i] = pis.pi_between;
std::get<2>(results_pi_)[i] = pis.pi_total;
Expand Down Expand Up @@ -351,6 +397,25 @@ class FstPoolProcessor
// Internal Members
// -------------------------------------------------------------------------

private:

void allocate_pi_result_vectors_() const
{
// Only allocate when someone first calls this.
// Does not do anything afterwards.
auto const res_sz = calculators_.size();
assert( std::get<0>( results_pi_ ).size() == 0 || std::get<0>( results_pi_ ).size() == res_sz );
assert( std::get<1>( results_pi_ ).size() == 0 || std::get<1>( results_pi_ ).size() == res_sz );
assert( std::get<2>( results_pi_ ).size() == 0 || std::get<2>( results_pi_ ).size() == res_sz );
std::get<0>( results_pi_ ).resize( res_sz );
std::get<1>( results_pi_ ).resize( res_sz );
std::get<2>( results_pi_ ).resize( res_sz );
}

// -------------------------------------------------------------------------
// Internal Members
// -------------------------------------------------------------------------

private:

// The pairs of sample indices of the variant between which we want to compute FST,
Expand Down
Loading

0 comments on commit f7c74c4

Please sign in to comment.