Skip to content

Commit

Permalink
Merge pull request #2935 from jltsiren/master
Browse files Browse the repository at this point in the history
Return all non-overlapping full-length extensions
  • Loading branch information
jltsiren authored Aug 1, 2020
2 parents a25a13d + 3148019 commit aa10355
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 58 deletions.
46 changes: 29 additions & 17 deletions src/gapless_extender.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,8 +267,8 @@ void match_backward(GaplessExtension& match, const std::string& seq, gbwtgraph::
}

// Sort full-length extensions by internal_score, remove ones that are not
// full-length alignments, remove duplicates, and return the best two
// extensions that have sufficiently low overlap.
// full-length alignments, remove duplicates, and return the best extensions
// that have sufficiently low overlap.
void handle_full_length(const HandleGraph& graph, std::vector<GaplessExtension>& result, double overlap_threshold) {
std::sort(result.begin(), result.end(), [](const GaplessExtension& a, const GaplessExtension& b) -> bool {
if (a.full() && b.full()) {
Expand All @@ -281,20 +281,20 @@ void handle_full_length(const HandleGraph& graph, std::vector<GaplessExtension>&
if (!(result[i].full())) {
break; // No remaining full-length extensions.
}
if (tail == 0) {
tail = 1; // Best extension.
} else {
if (result[i] == result.front()) {
continue; // Duplicate.
}
if (result[i].overlap(graph, result.front()) <= overlap_threshold * result.front().length()) {
if (i > tail) {
result[tail] = std::move(result[i]);
}
tail++;
break; // Found a non-overlapping extension.
bool overlap = false;
for (size_t prev = 0; prev < tail; prev++) {
if (result[i].overlap(graph, result[prev]) > overlap_threshold * result[prev].length()) {
overlap = true;
break;
}
}
if (overlap) {
continue;
}
if (i > tail) {
result[tail] = std::move(result[i]);
}
tail++;
}
result.resize(tail);
}
Expand All @@ -311,7 +311,13 @@ void remove_duplicates(std::vector<GaplessExtension>& result) {
if (a.state.forward.node != b.state.forward.node) {
return (a.state.forward.node < b.state.forward.node);
}
return (a.state.backward.range < b.state.backward.range);
if (a.state.backward.range != b.state.backward.range) {
return (a.state.backward.range < b.state.backward.range);
}
if (a.state.forward.range != b.state.forward.range) {
return (a.state.forward.range < b.state.forward.range);
}
return (a.offset < b.offset);
};
std::sort(result.begin(), result.end(), sort_order);
size_t tail = 0;
Expand Down Expand Up @@ -657,8 +663,8 @@ std::vector<GaplessExtension> GaplessExtender::extend(cluster_type& cluster, con
}
}

// If we have a good enough full-length alignment, return the best two sufficiently
// distinct full-length alingments.
// If we have a good enough full-length alignment, return the best sufficiently
// distinct full-length alignments.
if (best_alignment < result.size() && result[best_alignment].internal_score <= max_mismatches) {
handle_full_length(*cache, result, overlap_threshold);
find_mismatches(sequence, *cache, result);
Expand Down Expand Up @@ -689,6 +695,12 @@ std::vector<GaplessExtension> GaplessExtender::extend(cluster_type& cluster, con

//------------------------------------------------------------------------------

bool GaplessExtender::full_length_extensions(const std::vector<GaplessExtension>& result, size_t max_mismatches) {
return (result.size() > 0 && result.front().full() && result.front().mismatches() <= max_mismatches);
}

//------------------------------------------------------------------------------

struct state_hash {
size_t operator()(const gbwt::BidirectionalState& state) const {
size_t result = wang_hash_64(state.forward.node);
Expand Down
31 changes: 21 additions & 10 deletions src/gapless_extender.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,13 @@ struct GaplessExtension
return (this->score < another.score);
}

/// Two extensions are equal if the same read interval matches the same search state.
/// Two extensions are equal if the same read interval matches the same search state
/// with the same node offset.
bool operator==(const GaplessExtension& another) const {
return (this->read_interval == another.read_interval && this->state == another.state);
return (this->read_interval == another.read_interval && this->state == another.state && this->offset == another.offset);
}

/// Two extensions are not equal if the state or the read is different.
/// Two extensions are not equal if the state, the read interval, or the node offset is different.
bool operator!=(const GaplessExtension& another) const {
return !(this->operator==(another));
}
Expand All @@ -106,8 +107,7 @@ struct GaplessExtension
* is a pair of matching read/graph positions and each extension is a gapless alignment
* of an interval of the read to a haplotype.
* A cluster is an unordered set of distinct seeds. Seeds in the same node with the same
* (read_offset - node_offset) difference are considered equivalent. All seeds in a cluster
* should correspond to the same alignment or positions near it.
* (read_offset - node_offset) difference are considered equivalent.
* GaplessExtender also needs an Aligner object for scoring the extension candidates.
*/
class GaplessExtender {
Expand Down Expand Up @@ -158,18 +158,29 @@ class GaplessExtender {
/**
* Find the highest-scoring extension for each seed in the cluster.
* If there is a full-length extension with at most max_mismatches
* mismatches, return the (up to two) best full-length extensions with
* at most overlap_threshold overlap, sorted by score in descending
* order.
* If that is not possible, trim the extensions to maximize score,
* sort them by read interval, and remove duplicates.
* mismatches, sort them in descending order by score and return the
* best non-overlapping full-length extensions. Two extensions overlap
* if the fraction of identical base mappings is greater than
* overlap_threshold.
* If there are no good enough full-length extensions, trim the
* extensions to maximize the score and remove duplicates. In this
* case, the extensions are sorted by read interval.
* Use full_length_extensions() to determine the type of the returned
* extension set.
* Allow any number of mismatches in the initial node, at least
* max_mismatches mismatches in the entire extension, and at least
* max_mismatches / 2 mismatches on each flank.
* Use the provided CachedGBWTGraph or allocate a new one.
*/
std::vector<GaplessExtension> extend(cluster_type& cluster, const std::string& sequence, const gbwtgraph::CachedGBWTGraph* cache = nullptr, size_t max_mismatches = MAX_MISMATCHES, double overlap_threshold = OVERLAP_THRESHOLD) const;

/**
* Determine whether the extension set contains non-overlapping
* full-length extensions sorted in descending order by score. Use
* the same value of max_mismatches as in extend().
*/
static bool full_length_extensions(const std::vector<GaplessExtension>& result, size_t max_mismatches = MAX_MISMATCHES);

/**
* Find the distinct local haplotypes in the given subgraph and return the corresponding paths.
* For each path haplotype_paths[i], the output graph will contain node 2i + 1 with sequence
Expand Down
48 changes: 17 additions & 31 deletions src/minimizer_mapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,13 +320,7 @@ vector<Alignment> MinimizerMapper::map(Alignment& aln) {
Alignment best_alignment = aln;
Alignment second_best_alignment = aln;

// Determine if we got full-length alignments or local alignments.
bool full_length_extensions = (extensions.size() <= 2);
for (auto& extension : extensions) {
full_length_extensions &= extension.full();
}

if (full_length_extensions) {
if (GaplessExtender::full_length_extensions(extensions)) {
// We got full-length extensions, so directly convert to an Alignment.

if (track_provenance) {
Expand All @@ -342,7 +336,7 @@ vector<Alignment> MinimizerMapper::map(Alignment& aln) {

if (extensions.size() > 1) {
//Do the same thing for the second extension, if one exists
this->extension_to_alignment(extensions.back(), second_best_alignment);
this->extension_to_alignment(extensions[1], second_best_alignment);

#ifdef debug
cerr << "Produced additional alignment directly from full length gapless extension " << extension_num << endl;
Expand Down Expand Up @@ -1036,13 +1030,7 @@ pair<vector<Alignment>, vector< Alignment>> MinimizerMapper::map_paired(Alignmen
Alignment second_best_alignment = aln;


// Determine if we got full-length alignments or local alignments.
bool full_length_extensions = (extensions.size() <= 2);
for (auto& extension : extensions) {
full_length_extensions &= extension.full();
}

if (full_length_extensions) {
if (GaplessExtender::full_length_extensions(extensions)) {
// We got full-length extensions, so directly convert to an Alignment.

if (track_provenance) {
Expand All @@ -1058,7 +1046,7 @@ pair<vector<Alignment>, vector< Alignment>> MinimizerMapper::map_paired(Alignmen

if (extensions.size() > 1) {
//Do the same thing for the second extension, if one exists
this->extension_to_alignment(extensions.back(), second_best_alignment);
this->extension_to_alignment(extensions[1], second_best_alignment);

#ifdef debug
cerr << "Produced additional alignment directly from full length gapless extension " << extension_num << endl;
Expand Down Expand Up @@ -2149,16 +2137,10 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
// Find all seeds in the subgraph and try to get a full-length extension.
GaplessExtender::cluster_type seeds = this->seeds_in_subgraph(minimizers, rescue_nodes);
std::vector<GaplessExtension> extensions = this->extender.extend(seeds, rescued_alignment.sequence(), &cached_graph);
size_t best = extensions.size();
for (size_t i = 0; i < extensions.size(); i++) {
if (best >= extensions.size() || extensions[i].score > extensions[best].score) {
best = i;
}
}

// If we have a full-length extension, use it as the rescued alignment.
if (best < extensions.size() && extensions[best].full()) {
this->extension_to_alignment(extensions[best], rescued_alignment);
if (GaplessExtender::full_length_extensions(extensions)) {
this->extension_to_alignment(extensions.front(), rescued_alignment);
return;
}

Expand All @@ -2180,6 +2162,14 @@ void MinimizerMapper::attempt_rescue(const Alignment& aligned_read, Alignment& r
return;
}

// Determine the best extension.
size_t best = extensions.size();
for (size_t i = 0; i < extensions.size(); i++) {
if (best >= extensions.size() || extensions[i].score > extensions[best].score) {
best = i;
}
}

// Use the best extension as a seed for dozeu.
// Also ensure that the entire extension is in the subgraph.
std::vector<MaximalExactMatch> dozeu_seed;
Expand Down Expand Up @@ -2554,13 +2544,9 @@ int MinimizerMapper::score_extension_group(const Alignment& aln, const vector<Ga
if (extended_seeds.empty()) {
// TODO: We should never see an empty group of extensions
return 0;
} else if (extended_seeds.front().full()) {
// These are length matches. We already have the score.
int best_score = 0;
for (auto& extension : extended_seeds) {
best_score = max(best_score, extension.score);
}
return best_score;
} else if (GaplessExtender::full_length_extensions(extended_seeds)) {
// These are full-length matches. We already have the score.
return extended_seeds.front().score;
} else {
// This is a collection of one or more non-full-length extended seeds.

Expand Down

0 comments on commit aa10355

Please sign in to comment.