Skip to content

Commit

Permalink
Add genome locus set support to Gapless Stream
Browse files Browse the repository at this point in the history
  • Loading branch information
lczech committed May 30, 2024
1 parent 030883c commit 6971ee8
Show file tree
Hide file tree
Showing 5 changed files with 290 additions and 34 deletions.
83 changes: 68 additions & 15 deletions lib/genesis/population/stream/variant_gapless_input_stream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,18 @@ VariantGaplessInputStream::Iterator::Iterator( VariantGaplessInputStream* parent
// chromosome, and we can start the processing.
assert( current_locus_.chromosome != "" && current_locus_.position != 0 );
start_chromosome_();
prepare_current_variant_();

// We have just initialized the chrosome, including all cache pointer for the given references.
// We now use that to check that the position where we started is actually covered by the
// genome locus set filter. If not, we move on until we find a covered position.
// This is inefficient, as we are looping over gaps and data that might be filtered out
// immediately again. But it is good enough for now, and the use case is pretty special anyway.
if( current_locus_is_covered_by_genome_locus_set_() ) {
prepare_current_variant_();
} else {
// The advance function will loop until it finds a covered locus.
advance_();
}
}

// =================================================================================================
Expand All @@ -116,22 +127,30 @@ void VariantGaplessInputStream::Iterator::advance_()
// Some basic checks.
assert( parent_ );

// Move the current_locus_, and potentially the input iterator,
// to the next position we want to process.
advance_current_locus_();
// We need to loop until we find a locus that is actually covered by the genome locus set.
// If that is not given anyway, this loop will immediately exit. But in case that we have
// a genome locus set filter, for simplicty, we have implemented this as a loop here.
// At least, we only do the work of preparing the variant at the very end, once we know
// that we want to visit it.
do {
// Move the current_locus_, and potentially the input iterator,
// to the next position we want to process.
advance_current_locus_();

// If there is no next position, we are done.
if( current_locus_.empty() ) {
parent_ = nullptr;
return;
}
assert( current_locus_.chromosome != "" && current_locus_.position != 0 );

// If there is no next position, we are done.
if( current_locus_.empty() ) {
parent_ = nullptr;
return;
}
assert( current_locus_.chromosome != "" && current_locus_.position != 0 );
// If the next position is the start of a chromosome,
// we need to set it up correctly first.
if( current_locus_.position == 1 ) {
start_chromosome_();
}
} while( ! current_locus_is_covered_by_genome_locus_set_() );

// If the next position is the start of a chromosome,
// we need to set it up correctly first.
if( current_locus_.position == 1 ) {
start_chromosome_();
}

// Now we have everything to populate our variant as needed.
prepare_current_variant_();
Expand Down Expand Up @@ -183,6 +202,15 @@ void VariantGaplessInputStream::Iterator::start_chromosome_()
);
}
}

// Lastly, for the genome locus set, we do the same check, but slightly different:
// The set might not contain chromosomes that are filtered out completely anyway,
// and in that case we do not want to fail, but just indicate that be setting the
// cache iterator to the end. That is done by the find function anyway,
// so no further steps or checks are needed in that case.
if( parent_->genome_locus_set_ ) {
genome_locus_set_it_ = parent_->genome_locus_set_->find( chr );
}
}

// -------------------------------------------------------------------------
Expand Down Expand Up @@ -519,5 +547,30 @@ void VariantGaplessInputStream::Iterator::check_input_iterator_()
}
}

// -------------------------------------------------------------------------
// current_locus_is_covered_by_genome_locus_set_
// -------------------------------------------------------------------------

bool VariantGaplessInputStream::Iterator::current_locus_is_covered_by_genome_locus_set_()
{
assert( parent_ );

// Without a given genome locus set, we always consider this position to be covered.
if( ! parent_->genome_locus_set_ ) {
return true;
}

// If we are here, we have a genome locus set filter given.
// If the chromosome is not in the set, that means that the position is not covered.
if( genome_locus_set_it_ == parent_->genome_locus_set_->end() ) {
return false;
}

// If it contains the given chromosome, we use that to determine if the locus is covered.
// We assume that the iterator is at the current chromosome.
assert( genome_locus_set_it_->first == current_locus_.chromosome );
return parent_->genome_locus_set_->is_covered( genome_locus_set_it_, current_locus_.position );
}

} // namespace population
} // namespace genesis
74 changes: 69 additions & 5 deletions lib/genesis/population/stream/variant_gapless_input_stream.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
Contact:
Lucas Czech <[email protected]>
Department of Plant Biology, Carnegie Institution For Science
260 Panama Street, Stanford, CA 94305, USA
Lucas Czech <[email protected]>
University of Copenhagen, Globe Institute, Section for GeoGenetics
Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
*/

/**
Expand All @@ -34,6 +34,7 @@
#include "genesis/population/function/functions.hpp"
#include "genesis/population/function/genome_locus.hpp"
#include "genesis/population/genome_locus.hpp"
#include "genesis/population/genome_locus_set.hpp"
#include "genesis/population/stream/variant_input_stream.hpp"
#include "genesis/population/variant.hpp"
#include "genesis/sequence/reference_genome.hpp"
Expand Down Expand Up @@ -74,6 +75,17 @@ namespace population {
* reference but not in the input data at all (of course, all of them will then only contain missing
* data). This makes sure that the full reference is iterated over.
*
* In some cases, the Variant stream is intended to be subset to particular genomic regions.
* For this, use genome_locus_set() to set a list of the regions to subset to. Note though that
* our current implementation here is slightly inefficient, as we here still first attempt to fill
* in the gaps in the input to some degree, only to then throw them out again if they are to be
* removed by that region filter. This is unfortunate, but a more efficient implementation
* that just skips all those regions in the first place turned out to be quite involved due to the
* interactions between the data stream, reference dict, and region filters, and we did not attempt
* to make this work for now. The current implementation is however still slightly more efficient
* than applying the region filter afterwards, as we are at least able to skip part of the process
* for the filtered positions.
*
* The iterator is useful in siutations where input is expected to have missing data, so that it's
* skipped by its iterator, but some external algorithm or processing wants to use all the positions.
* For instance, when writing a sync file, this can be used to make a "gsync" file that contains
Expand Down Expand Up @@ -295,6 +307,14 @@ class VariantGaplessInputStream
*/
void check_input_iterator_();

/**
* @brief Check if the curret locus is covered by the genome locus set.
*
* This returns `false` only if a genome locus set filter is given, and does not cover
* the given position. Without a genome locus set filter, this always returns `true`.
*/
bool current_locus_is_covered_by_genome_locus_set_();

/**
* @brief Get the Variant at the current position.
*
Expand Down Expand Up @@ -350,10 +370,11 @@ class VariantGaplessInputStream
// as they are themselves able to tell us if they are still good (via their operator bool).
VariantInputStream::Iterator iterator_;

// Cache for the ref genome and seq dict sequences, so that we do not have to find them
// on every iteration here.
// Cache for the ref genome and seq dict sequences, as well as the genome locus set,
// so that we do not have to find them on every iteration here.
::genesis::sequence::ReferenceGenome::const_iterator ref_genome_it_;
::genesis::sequence::SequenceDict::const_iterator seq_dict_it_;
::genesis::population::GenomeLocusSet::const_iterator genome_locus_set_it_;

// We keep track of which chromosomes we have seen yet, in order to avoid errors due to
// unordered input, and also to know which chromosomes to process later for the case that
Expand Down Expand Up @@ -535,6 +556,48 @@ class VariantGaplessInputStream
return *this;
}

/**
* @brief Get the currently set GenomeLocusSet for subsetting the iteration positions.
*/
std::shared_ptr<GenomeLocusSet> genome_locus_set() const
{
return genome_locus_set_;
}

/**
* @brief Set a genomic locus set for subsetting the iteration positions.
*
* Only positions listed in the povided set are iterated. This has the same effect as filtering
* out any positions that are not covered in the provided set _after_ applying this gapless
* iterator. That means, any gaps of uncovered positions in the given genome locus set will
* still be gaps in the iteration here - they are not filled in. The main purpose of this is
* hence to filter for larger regions, and not for individual positions such as SNPs
* (in that case, doing a gapless iteration on top of SNP filtering wouldn't make sense in the
* first place anyway).
*
* This is recommended in order to avoid unnecessary computations when subsetting the
* Variant stream to certain chromosomes or regions. If this was not used, the following would
* happen: On the one hand, if the input stream was already subset to some regions, then this
* gapless iterator would re-introduce any positions that were previously removed, but with
* missing data, which is likely not what we want. On the other hand, if the subsetting to
* regions was done after using this gapless iterator, a lot of uncessary positions would first
* be iterated, only to then be removed again if they are not in the required regions.
*
* So instead, setting the region filter here already makes sure that we only iterate the
* regions and positions that are actually needed.
*/
self_type& genome_locus_set( std::shared_ptr<GenomeLocusSet> value )
{
if( started_ ) {
throw std::runtime_error(
"VariantGaplessInputStream::genome_locus_set() cannot be called "
"once the iteration has been started with begin()."
);
}
genome_locus_set_ = value;
return *this;
}

// -------------------------------------------------------------------------
// Internal Members
// -------------------------------------------------------------------------
Expand All @@ -550,6 +613,7 @@ class VariantGaplessInputStream
// Also, we here subset to regions if needed, to avoid unnecessary work later.
std::shared_ptr<::genesis::sequence::ReferenceGenome> ref_genome_;
std::shared_ptr<::genesis::sequence::SequenceDict> seq_dict_;
std::shared_ptr<GenomeLocusSet> genome_locus_set_;

};

Expand Down
28 changes: 25 additions & 3 deletions lib/genesis/population/stream/variant_input_stream_adapters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
Contact:
Lucas Czech <[email protected]>
Department of Plant Biology, Carnegie Institution For Science
260 Panama Street, Stanford, CA 94305, USA
Lucas Czech <[email protected]>
University of Copenhagen, Globe Institute, Section for GeoGenetics
Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
*/

/**
Expand Down Expand Up @@ -143,6 +143,28 @@ VariantInputStream make_variant_gapless_input_stream(
return make_variant_input_stream_from_variant_gapless_input_stream( gapless_input );
}

VariantInputStream make_variant_gapless_input_stream(
VariantInputStream const& input,
std::shared_ptr<::genesis::sequence::ReferenceGenome> ref_genome,
std::shared_ptr<GenomeLocusSet> genome_locus_set
) {
auto gapless_input = VariantGaplessInputStream( input );
gapless_input.reference_genome( ref_genome );
gapless_input.genome_locus_set( genome_locus_set );
return make_variant_input_stream_from_variant_gapless_input_stream( gapless_input );
}

VariantInputStream make_variant_gapless_input_stream(
VariantInputStream const& input,
std::shared_ptr<::genesis::sequence::SequenceDict> seq_dict,
std::shared_ptr<GenomeLocusSet> genome_locus_set
) {
auto gapless_input = VariantGaplessInputStream( input );
gapless_input.sequence_dict( seq_dict );
gapless_input.genome_locus_set( genome_locus_set );
return make_variant_input_stream_from_variant_gapless_input_stream( gapless_input );
}

VariantInputStream make_variant_input_stream_from_variant_gapless_input_stream(
VariantGaplessInputStream const& gapless_input
) {
Expand Down
35 changes: 32 additions & 3 deletions lib/genesis/population/stream/variant_input_stream_adapters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
Contact:
Lucas Czech <[email protected]>
Department of Plant Biology, Carnegie Institution For Science
260 Panama Street, Stanford, CA 94305, USA
Lucas Czech <[email protected]>
University of Copenhagen, Globe Institute, Section for GeoGenetics
Oster Voldgade 5-7, 1350 Copenhagen K, Denmark
*/

/**
Expand All @@ -31,6 +31,7 @@
* @ingroup population
*/

#include "genesis/population/genome_locus_set.hpp"
#include "genesis/population/sample_counts.hpp"
#include "genesis/population/stream/variant_gapless_input_stream.hpp"
#include "genesis/population/stream/variant_input_stream_sources.hpp"
Expand Down Expand Up @@ -112,6 +113,34 @@ VariantInputStream make_variant_gapless_input_stream(
std::shared_ptr<::genesis::sequence::SequenceDict> seq_dict
);

/**
* @copybrief make_variant_gapless_input_stream( VariantInputStream const& )
*
* This overload additionally sets the reference genome for the gapless iteration,
* as well as a genome locus set to filter the positions.
*
* See also make_variant_input_stream_from_variant_gapless_input_stream()
*/
VariantInputStream make_variant_gapless_input_stream(
VariantInputStream const& input,
std::shared_ptr<::genesis::sequence::ReferenceGenome> ref_genome,
std::shared_ptr<GenomeLocusSet> genome_locus_set
);

/**
* @copybrief make_variant_gapless_input_stream( VariantInputStream const& )
*
* This overload additionally sets the sequence dictionary for the gapless iteration,
* as well as a genome locus set to filter the positions.
*
* See also make_variant_input_stream_from_variant_gapless_input_stream()
*/
VariantInputStream make_variant_gapless_input_stream(
VariantInputStream const& input,
std::shared_ptr<::genesis::sequence::SequenceDict> seq_dict,
std::shared_ptr<GenomeLocusSet> genome_locus_set
);

/**
* @brief Create a VariantInputStream that wraps a VariantGaplessInputStream.
*
Expand Down
Loading

0 comments on commit 6971ee8

Please sign in to comment.