Skip to content

Commit

Permalink
Do not include the endpoints in WFA alignment
Browse files Browse the repository at this point in the history
  • Loading branch information
jltsiren committed Jun 14, 2022
1 parent 526f8f1 commit 69ffe4d
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 107 deletions.
77 changes: 47 additions & 30 deletions src/gbwt_extender.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -844,13 +844,6 @@ struct MatchPos {
}
return (this->seq_offset < another.seq_offset);
}

// Returns the non-empty position further on the diagonal (on the sequence).
static const MatchPos& max(const MatchPos& a, const MatchPos& b) {
if (a.empty()) { return b; }
if (b.empty()) { return a; }
return (a.seq_offset > b.seq_offset ? a : b);
}
};

// A point in an WFA score matrix (for a specific node).
Expand Down Expand Up @@ -980,7 +973,7 @@ class WFATree {
std::vector<WFANode> nodes;

// Best alignment found so far. If we reached the destination in the graph,
// the score includes the possible insertion at the end but the point itself
// the score includes the implicit insertion at the end but the point itself
// does not.
WFAPoint candidate_point;
uint32_t candidate_node;
Expand Down Expand Up @@ -1030,9 +1023,14 @@ class WFATree {
static bool is_root(uint32_t node) { return (node == 0); }
uint32_t parent(uint32_t node) const { return this->nodes[node].parent; }

// Assumes length > 0.
int32_t gap_extend_penalty(uint32_t length) const {
return static_cast<int32_t>(length) * this->gap_extend;
}

// Assumes length > 0.
int32_t gap_penalty(uint32_t length) const {
return this->gap_open + static_cast<int32_t>(length) * this->gap_extend;
return this->gap_open + this->gap_extend_penalty(length);
}

// wf_extend() in the paper.
Expand Down Expand Up @@ -1106,21 +1104,32 @@ class WFATree {
subst.seq_offset++;
this->successor_offset(subst);
}
subst = MatchPos::max(subst, ins);
subst = MatchPos::max(subst, del);

// Determine the edit that reaches furthest on the diagonal.
bool is_insertion = false;
if (subst < ins) {
subst = std::move(ins);
is_insertion = true;
}
if (subst < del) {
subst = std::move(del);
is_insertion = false;
}

if (!subst.empty()) {
// If we are just past the end position, we get a candidate alignment by
// assuming that the rest of the sequence is an insertion.
// If `subst` is an insertion, we already got a better alignment from the
// match preceding it.
if (this->nodes[subst.node()].same_node(to) && subst.node_offset == offset(to) + 1) {
// If we reached the end position with the edit, we get a candidate
// alignment by assuming that the rest of the sequence is an insertion.
// If the edit is an insertion, we charge the gap open cost again, but
// we already got the same insertion without the extra cost from the
// match preceding the insertion.
if (this->nodes[subst.node()].same_node(to) && subst.node_offset == offset(to)) {
uint32_t gap_length = this->sequence.length() - subst.seq_offset;
int32_t new_score = score;
int32_t gap_score = 0;
if (gap_length > 0) {
new_score += this->gap_penalty(gap_length);
gap_score = this->gap_penalty(gap_length);
}
if (new_score < this->candidate_point.score) {
this->candidate_point = { new_score, diagonal, subst.seq_offset, subst.node_offset };
if (score + gap_score < this->candidate_point.score) {
this->candidate_point = { score + gap_score, diagonal, subst.seq_offset, subst.node_offset };
this->candidate_node = subst.node();
}
}
Expand Down Expand Up @@ -1221,18 +1230,18 @@ class WFATree {
while (true) {
bool may_reach_to = this->nodes[pos.node()].same_node(to) & (pos.node_offset <= offset(to));
this->nodes[pos.node()].match_forward(this->sequence, this->graph, pos);
// We got a match that covers the end position but may extend past it.
// We got a match that reached the end or went past it.
// Alternatively there is no end position and we have aligned the entire sequence.
// This gives us a candidate where the rest of the sequence is an insertion.
if ((may_reach_to && pos.node_offset > offset(to)) || (no_pos(to) && pos.seq_offset >= this->sequence.length())) {
uint32_t overshoot = (no_pos(to) ? 0 : pos.node_offset - offset(to) - 1);
if ((may_reach_to && pos.node_offset >= offset(to)) || (no_pos(to) && pos.seq_offset >= this->sequence.length())) {
uint32_t overshoot = (no_pos(to) ? 0 : pos.node_offset - offset(to));
uint32_t gap_length = (this->sequence.length() - pos.seq_offset) + overshoot;
int32_t new_score = score;
int32_t gap_score = 0;
if (gap_length > 0) {
new_score += this->gap_penalty(gap_length);
gap_score = this->gap_penalty(gap_length);
}
if (new_score < this->candidate_point.score) {
this->candidate_point = { new_score, diagonal, pos.seq_offset - overshoot, static_cast<uint32_t>(offset(to) + 1) };
if (score + gap_score < this->candidate_point.score) {
this->candidate_point = { score + gap_score, diagonal, pos.seq_offset - overshoot, static_cast<uint32_t>(offset(to)) };
this->candidate_node = pos.node();
}
}
Expand Down Expand Up @@ -1363,7 +1372,7 @@ WFAAlignment WFAExtender::connect(std::string sequence, pos_t from, pos_t to) co
}
this->mask(sequence);

WFATree tree(*(this->graph), sequence, root_state, offset(from), *(this->aligner));
WFATree tree(*(this->graph), sequence, root_state, offset(from) + 1, *(this->aligner));

int32_t score = 0;
while (true) {
Expand Down Expand Up @@ -1398,7 +1407,7 @@ WFAAlignment WFAExtender::connect(std::string sequence, pos_t from, pos_t to) co

// Start building an alignment. Store the path first.
WFAAlignment result {
{}, {}, static_cast<uint32_t>(offset(from)), 0,
{}, {}, static_cast<uint32_t>(offset(from) + 1), 0,
tree.candidate_point.seq_offset + unaligned_tail,
tree.candidate_point.alignment_score(*(this->aligner), unaligned_tail)
};
Expand All @@ -1418,7 +1427,7 @@ WFAAlignment WFAExtender::connect(std::string sequence, pos_t from, pos_t to) co
if (unaligned_tail > 0) {
uint32_t final_insertion = sequence.length() - tree.candidate_point.seq_offset;
result.append(WFAAlignment::insertion, final_insertion);
point.score -= tree.gap_penalty(final_insertion);
point.score -= tree.gap_penalty(unaligned_tail);
}

// Backtrace the edits.
Expand Down Expand Up @@ -1472,6 +1481,14 @@ WFAAlignment WFAExtender::connect(std::string sequence, pos_t from, pos_t to) co
}
std::reverse(result.edits.begin(), result.edits.end());

// We used "from + 1" as the starting position for the alignment. That could have
// been a past-the-end position in the initial node. Once we have an actual path
// instead of a tree of potential paths, we can remove the unused node.
if (!result.path.empty() && result.node_offset >= this->graph->get_length(result.path.front())) {
result.path.erase(result.path.begin());
result.node_offset = 0;
}

// Due to the way we expand the tree of GBWT search states and store wavefront
// information in the leaves, we sometimes do not use any bases in the final node.
// We deal with this now to avoid facing the issue later.
Expand Down
5 changes: 5 additions & 0 deletions src/gbwt_extender.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,11 @@ class WFAExtender {
/**
* Align the sequence to a haplotype between the two graph positions.
*
* The endpoints are assumed to be valid graph positions. In order for
* there to be an alignment, there must be a haplotype that includes the
* endpoints and connects them. However, the endpoints are not covered
* by the returned alignment.
*
* The sequence that will be aligned is passed by value. All non-ACGT
* characters are masked with character X, which should not match any
* character in the graph.
Expand Down
Loading

0 comments on commit 69ffe4d

Please sign in to comment.