diff --git a/src/gbwt_extender.cpp b/src/gbwt_extender.cpp index c939823357c..8334961f77e 100644 --- a/src/gbwt_extender.cpp +++ b/src/gbwt_extender.cpp @@ -735,6 +735,7 @@ struct state_hash { bool WFAAlignment::unlocalized_insertion() const { return ( + this->ok && this->path.empty() && this->edits.size() == 1 && this->edits.front().first == insertion @@ -755,11 +756,11 @@ uint32_t WFAAlignment::final_offset(const gbwtgraph::GBWTGraph& graph) const { } void WFAAlignment::flip(const gbwtgraph::GBWTGraph& graph, const std::string& sequence) { + this->seq_offset = sequence.length() - this->seq_offset - this->length; + if (this->path.empty()) { return; } - - this->seq_offset = sequence.length() - this->seq_offset - this->length; this->node_offset = graph.get_length(this->path.back()) - this->final_offset(graph); // Reverse the path and the edits. @@ -996,6 +997,9 @@ class WFATree { // The overall closed range of diagonals reached. std::pair max_diagonals; + // TODO: Remove when unnecessary. + bool debug; + WFATree(const gbwtgraph::GBWTGraph& graph, const std::string& sequence, const gbwt::SearchState& root, uint32_t node_offset, const Aligner& aligner) : graph(graph), sequence(sequence), nodes(), @@ -1004,7 +1008,8 @@ class WFATree { gap_open(2 * (aligner.gap_open - aligner.gap_extension)), gap_extend(2 * aligner.gap_extension + aligner.match), score_bound(0), - possible_scores(), max_diagonals(0, 0) + possible_scores(), max_diagonals(0, 0), + debug(false) { this->nodes.emplace_back(root, 0); this->nodes.front().update(WFANode::MATCHES, 0, 0, 0, node_offset); @@ -1103,6 +1108,7 @@ class WFATree { if (!subst.empty()) { subst.seq_offset++; this->successor_offset(subst); + this->expand_if_necessary(subst); } // Determine the edit that reaches furthest on the diagonal. @@ -1134,7 +1140,6 @@ class WFATree { } } this->nodes[subst.node()].update(WFANode::MATCHES, score, diagonal, subst); - this->expand_if_necessary(subst); } } } @@ -1363,7 +1368,7 @@ class WFATree { //------------------------------------------------------------------------------ WFAAlignment WFAExtender::connect(std::string sequence, pos_t from, pos_t to) const { - if (this->graph == nullptr || this->aligner == nullptr || sequence.empty()) { + if (this->graph == nullptr || this->aligner == nullptr) { return WFAAlignment(); } gbwt::SearchState root_state = this->graph->get_state(this->graph->get_handle(id(from), is_rev(from))); @@ -1373,6 +1378,7 @@ WFAAlignment WFAExtender::connect(std::string sequence, pos_t from, pos_t to) co this->mask(sequence); WFATree tree(*(this->graph), sequence, root_state, offset(from) + 1, *(this->aligner)); + tree.debug = this->debug; int32_t score = 0; while (true) { @@ -1401,15 +1407,13 @@ WFAAlignment WFAExtender::connect(std::string sequence, pos_t from, pos_t to) co return WFAAlignment(); } } - if (tree.candidate_point.seq_offset == 0) { - return WFAAlignment(); - } // Start building an alignment. Store the path first. WFAAlignment result { {}, {}, static_cast(offset(from) + 1), 0, tree.candidate_point.seq_offset + unaligned_tail, - tree.candidate_point.alignment_score(*(this->aligner), unaligned_tail) + tree.candidate_point.alignment_score(*(this->aligner), unaligned_tail), + true }; uint32_t node = tree.candidate_node; while (true) { diff --git a/src/gbwt_extender.hpp b/src/gbwt_extender.hpp index 327751e815a..eba41c9dcff 100644 --- a/src/gbwt_extender.hpp +++ b/src/gbwt_extender.hpp @@ -239,8 +239,15 @@ struct WFAAlignment { /// Alignment score. int32_t score = 0; - /// Returns true if the alignment is empty. - bool empty() const { return (this->length == 0); } + /// Is this an actual alignment or a failure? + bool ok = false; + + /// Is this an actual alignment or a failure? + operator bool() const { return this->ok; } + + /// Returns true if the alignment does not cover anything in the sequence + /// and the graph. + bool empty() const { return (this->path.empty() && this->edits.empty()); } /// Returns true if the alignment is an insertion without a location. bool unlocalized_insertion() const; @@ -280,7 +287,7 @@ struct WFAAlignment { * * VG scores a gap of length `n` as `gap_open + (n - 1) * gap_extend`, while WFA * papers use `gap_open + n * gap_extend`. Hence we use `gap_open - gap_extend` as - * the effective four-parameter gap open score inside this aligner. + * the effective four-parameter gap open score inside the aligner. * * NOTE: Most internal arithmetic operations use 32-bit integers. */ @@ -300,14 +307,17 @@ class WFAExtender { * The endpoints are assumed to be valid graph positions. In order for * there to be an alignment, there must be a haplotype that includes the * endpoints and connects them. However, the endpoints are not covered - * by the returned alignment. + * by the returned alignment. * * The sequence that will be aligned is passed by value. All non-ACGT * characters are masked with character X, which should not match any * character in the graph. * - * Returns an empty alignment if there is no alignment with an acceptable + * Returns a failed alignment if there is no alignment with an acceptable * score. + * + * NOTE: The alignment is to a path after `from` and before `to`. If the + * points are identical, such a path can only exist if there is a cycle. */ WFAAlignment connect(std::string sequence, pos_t from, pos_t to) const; @@ -315,10 +325,10 @@ class WFAExtender { * A special case of connect() for aligning the sequence to a haplotype * starting at the given position. If there is no alignment for the * entire sequence with an acceptable score, returns the highest-scoring - * partial alignment. + * partial alignment, which may be empty. * - * NOTE: This creates a suffix of the alignment by aligning a prefix of - * the sequence. + * NOTE: This creates a suffix of the full alignment by aligning a + * prefix of the sequence. * TODO: Should we use full-length bonuses? */ WFAAlignment suffix(const std::string& sequence, pos_t from) const; @@ -327,10 +337,10 @@ class WFAExtender { * A special case of connect() for aligning the sequence to a haplotype * ending at the given position. If there is no alignment for the entire * sequence with an acceptable score, returns the highest-scoring partial - * alignment. + * alignment, which may be empty. * - * NOTE: This creates a prefix of the alignment by aligning a suffix of - * the sequence. + * NOTE: This creates a prefix of the full alignment by aligning a suffix + * of the sequence. * TODO: Should we use full-length bonuses? */ WFAAlignment prefix(const std::string& sequence, pos_t to) const; diff --git a/src/unittest/gbwt_extender.cpp b/src/unittest/gbwt_extender.cpp index b1f6b69e7e9..3ff7d347506 100644 --- a/src/unittest/gbwt_extender.cpp +++ b/src/unittest/gbwt_extender.cpp @@ -922,6 +922,40 @@ gbwtgraph::GBWTGraph wfa_general_graph(const gbwt::GBWT& index) { return gbwtgraph::GBWTGraph(index, source); } +gbwt::GBWT wfa_cycle_gbwt() { + std::vector paths; + + // Path that skips the cycle + paths.emplace_back(); + paths.back().push_back(gbwt::Node::encode(1, false)); + paths.back().push_back(gbwt::Node::encode(2, false)); + paths.back().push_back(gbwt::Node::encode(3, false)); + + // Path that takes the cycle once. + paths.emplace_back(); + paths.back().push_back(gbwt::Node::encode(1, false)); + paths.back().push_back(gbwt::Node::encode(2, false)); + paths.back().push_back(gbwt::Node::encode(2, false)); + paths.back().push_back(gbwt::Node::encode(3, false)); + + // Path that takes the cycle twice. + paths.emplace_back(); + paths.back().push_back(gbwt::Node::encode(1, false)); + paths.back().push_back(gbwt::Node::encode(2, false)); + paths.back().push_back(gbwt::Node::encode(2, false)); + paths.back().push_back(gbwt::Node::encode(2, false)); + paths.back().push_back(gbwt::Node::encode(3, false)); + + return get_gbwt(paths); +} + +gbwtgraph::GBWTGraph wfa_cycle_graph(const gbwt::GBWT& index) { + gbwtgraph::SequenceSource source; + source.add_node(1, "CGC"); + source.add_node(2, "GA"); + source.add_node(3, "TAT"); + return gbwtgraph::GBWTGraph(index, source); +} void check_score(const WFAAlignment& alignment, const Aligner& aligner, int32_t matches, int32_t mismatches, int32_t gaps, int32_t gap_length) { int32_t extensions = gap_length - gaps; @@ -947,6 +981,8 @@ void check_unlocalized_insertion(const WFAAlignment& alignment, const std::strin void check_alignment(const WFAAlignment& alignment, const std::string& sequence, const gbwtgraph::GBWTGraph& graph, const Aligner& aligner, const pos_t* from, const pos_t* to) { + REQUIRE(alignment); + // Sequence range is sane and corresponds to the specified endpoints. REQUIRE(alignment.seq_offset + alignment.length <= sequence.length()); REQUIRE((from == nullptr) | (alignment.seq_offset == 0)); @@ -970,28 +1006,30 @@ void check_alignment(const WFAAlignment& alignment, const std::string& sequence, } // Check that the alignment is between the right positions, if provided, and the start/end nodes are used in the alignment. - REQUIRE(alignment.node_offset < graph.get_length(alignment.path.front())); - if (from != nullptr) { - handle_t from_handle = graph.get_handle(id(*from), is_rev(*from)); - if (alignment.path.front() == from_handle && alignment.node_offset > 0) { - REQUIRE(alignment.node_offset == offset(*from) + 1); - } else { - REQUIRE(offset(*from) + 1 == graph.get_length(from_handle)); - REQUIRE(graph.has_edge(from_handle, alignment.path.front())); - REQUIRE(alignment.node_offset == 0); + if (!alignment.path.empty()) { + REQUIRE(alignment.node_offset < graph.get_length(alignment.path.front())); + if (from != nullptr) { + handle_t from_handle = graph.get_handle(id(*from), is_rev(*from)); + if (alignment.path.front() == from_handle && alignment.node_offset > 0) { + REQUIRE(alignment.node_offset == offset(*from) + 1); + } else { + REQUIRE(offset(*from) + 1 == graph.get_length(from_handle)); + REQUIRE(graph.has_edge(from_handle, alignment.path.front())); + REQUIRE(alignment.node_offset == 0); + } } - } - uint32_t final_offset = alignment.final_offset(graph); - REQUIRE(final_offset > 0); - if (to != nullptr) { - handle_t to_handle = graph.get_handle(id(*to), is_rev(*to)); - uint32_t final_node_length = graph.get_length(alignment.path.back()); - if (alignment.path.back() == to_handle && final_offset < final_node_length) { - REQUIRE(final_offset == offset(*to)); - } else { - REQUIRE(offset(*to) == 0); - REQUIRE(graph.has_edge(alignment.path.back(), to_handle)); - REQUIRE(final_offset == final_node_length); + uint32_t final_offset = alignment.final_offset(graph); + REQUIRE(final_offset > 0); + if (to != nullptr) { + handle_t to_handle = graph.get_handle(id(*to), is_rev(*to)); + uint32_t final_node_length = graph.get_length(alignment.path.back()); + if (alignment.path.back() == to_handle && final_offset < final_node_length) { + REQUIRE(final_offset == offset(*to)); + } else { + REQUIRE(offset(*to) == 0); + REQUIRE(graph.has_edge(alignment.path.back(), to_handle)); + REQUIRE(final_offset == final_node_length); + } } } @@ -1051,7 +1089,9 @@ void check_alignment(const WFAAlignment& alignment, const std::string& sequence, } } } - REQUIRE((path_offset == alignment.path.size() - 1) | (path_offset == alignment.path.size() && node_offset == 0)); + if (!alignment.path.empty()) { + REQUIRE((path_offset == alignment.path.size() - 1) | (path_offset == alignment.path.size() && node_offset == 0)); + } } } // anonymous namespace @@ -1339,11 +1379,44 @@ TEST_CASE("Special cases in a linear graph", "[wfa_extender]") { Aligner aligner; WFAExtender extender(graph, aligner); - SECTION("Empty sequence") { + SECTION("Empty sequence, deletion") { + // DD std::string sequence; pos_t from(2, false, 0); pos_t to(2, false, 3); WFAAlignment result = extender.connect(sequence, from, to); - REQUIRE(result.empty()); + check_score(result, aligner, sequence.length(), 0, 1, 2); + check_alignment(result, sequence, graph, aligner, &from, &to); + } + + SECTION("Empty sequence, failure") { + // DDDDDD|DDDD + std::string sequence; + pos_t from(2, false, 0); pos_t to(3, false, 4); + WFAAlignment result = extender.connect(sequence, from, to); + REQUIRE_FALSE(result); + } + + SECTION("Identical endpoints") { + std::string sequence; + pos_t from(2, false, 3); pos_t to(2, false, 3); + WFAAlignment result = extender.connect(sequence, from, to); + REQUIRE_FALSE(result); + } + + SECTION("Adjacent endpoints, empty sequence") { + std::string sequence; + pos_t from(2, false, 6); pos_t to(3, false, 0); + WFAAlignment result = extender.connect(sequence, from, to); + check_score(result, aligner, sequence.length(), 0, 0, 0); + check_alignment(result, sequence, graph, aligner, &from, &to); + } + + SECTION("Adjacent endpoints, insertion") { + std::string sequence("AA"); + pos_t from(2, false, 6); pos_t to(3, false, 0); + WFAAlignment result = extender.connect(sequence, from, to); + check_score(result, aligner, sequence.length() - 2, 0, 1, 2); + check_alignment(result, sequence, graph, aligner, &from, &to); } SECTION("Cannot align within the score bound") { @@ -1351,14 +1424,14 @@ TEST_CASE("Special cases in a linear graph", "[wfa_extender]") { std::string sequence("GAGGAGA"); pos_t from(1, false, 2); pos_t to(3, false, 0); WFAAlignment result = extender.connect(sequence, from, to); - REQUIRE(result.empty()); + REQUIRE_FALSE(result); } SECTION("Cannot reach target") { std::string sequence("GATTA"); pos_t from(3, false, 0); pos_t to(2, false, 6); WFAAlignment result = extender.connect(sequence, from, to); - REQUIRE(result.empty()); + REQUIRE_FALSE(result); } SECTION("Run out of graph") { @@ -1366,7 +1439,7 @@ TEST_CASE("Special cases in a linear graph", "[wfa_extender]") { std::string sequence("ATCCCC"); pos_t from(4, false, 0); pos_t to(5, false, 3); WFAAlignment result = extender.connect(sequence, from, to); - REQUIRE(result.empty()); + REQUIRE_FALSE(result); } SECTION("Mixed edits") { @@ -1401,7 +1474,8 @@ TEST_CASE("Prefixes in a linear graph", "[wfa_extender]") { std::string sequence; pos_t to(2, false, 1); WFAAlignment result = extender.prefix(sequence, to); - REQUIRE(result.empty()); + check_score(result, aligner, sequence.length(), 0, 0, 0); + check_alignment(result, sequence, graph, aligner, nullptr, &to); } SECTION("Got an unlocalized insertion") { @@ -1417,7 +1491,8 @@ TEST_CASE("Prefixes in a linear graph", "[wfa_extender]") { std::string sequence("GGGGGG"); pos_t to(3, false, 0); WFAAlignment result = extender.prefix(sequence, to); - REQUIRE(result.empty()); + check_score(result, aligner, 0, 0, 0, 0); + check_alignment(result, sequence, graph, aligner, nullptr, &to); } SECTION("Exact match, middle of a node") { @@ -1502,7 +1577,8 @@ TEST_CASE("Suffixes in a linear graph", "[wfa_extender]") { std::string sequence; pos_t from(2, false, 1); WFAAlignment result = extender.suffix(sequence, from); - REQUIRE(result.empty()); + check_score(result, aligner, sequence.length(), 0, 0, 0); + check_alignment(result, sequence, graph, aligner, &from, nullptr); } SECTION("Got an unlocalized insertion") { @@ -1518,7 +1594,8 @@ TEST_CASE("Suffixes in a linear graph", "[wfa_extender]") { std::string sequence("GGGGGG"); pos_t from(2, false, 0); WFAAlignment result = extender.suffix(sequence, from); - REQUIRE(result.empty()); + check_score(result, aligner, 0, 0, 0, 0); + check_alignment(result, sequence, graph, aligner, &from, nullptr); } SECTION("Exact match, middle of a node") { @@ -1641,7 +1718,15 @@ TEST_CASE("Connect in a general graph", "[wfa_extender]") { std::string sequence("GGAAGT"); pos_t from(6, false, 0); pos_t to(10, false, 2); WFAAlignment result = extender.connect(sequence, from, to); - REQUIRE(result.empty()); + REQUIRE_FALSE(result); + } + + SECTION("No haplotype connection that includes the endpoints") { + // Matches node 8. + std::string sequence("GAA"); + pos_t from(6, false, 1); pos_t to(10, false, 0); + WFAAlignment result = extender.connect(sequence, from, to); + REQUIRE_FALSE(result); } SECTION("Not within score bound") { @@ -1649,7 +1734,7 @@ TEST_CASE("Connect in a general graph", "[wfa_extender]") { std::string sequence("CCATTACATAGGA"); pos_t from(1, false, 1); pos_t to(6, false, 0); WFAAlignment result = extender.connect(sequence, from, to); - REQUIRE(result.empty()); + REQUIRE_FALSE(result); } } @@ -1785,5 +1870,83 @@ TEST_CASE("Suffix in a general graph", "[wfa_extender]") { //------------------------------------------------------------------------------ +TEST_CASE("Connect with a cycle", "[wfa_extender]") { + // CGC GA TAT + // CGC GA GA TAT + // CGC GA GA GA TAT + gbwt::GBWT index = wfa_cycle_gbwt(); + gbwtgraph::GBWTGraph graph = wfa_cycle_graph(index); + Aligner aligner; + WFAExtender extender(graph, aligner); + + SECTION("Skip the cycle") { + std::string sequence("CGAT"); + pos_t from(1, false, 1); pos_t to(3, false, 1); + WFAAlignment result = extender.connect(sequence, from, to); + check_score(result, aligner, sequence.length(), 0, 0, 0); + check_alignment(result, sequence, graph, aligner, &from, &to); + } + + SECTION("Take it once") { + std::string sequence("CGAGAT"); + pos_t from(1, false, 1); pos_t to(3, false, 1); + WFAAlignment result = extender.connect(sequence, from, to); + check_score(result, aligner, sequence.length(), 0, 0, 0); + check_alignment(result, sequence, graph, aligner, &from, &to); + } + + SECTION("Take it twice") { + std::string sequence("CGAGAGAT"); + pos_t from(1, false, 1); pos_t to(3, false, 1); + WFAAlignment result = extender.connect(sequence, from, to); + check_score(result, aligner, sequence.length(), 0, 0, 0); + check_alignment(result, sequence, graph, aligner, &from, &to); + } + + SECTION("Take it thrice, insertion") { + std::string sequence("CGAGAGAGAT"); + pos_t from(1, false, 1); pos_t to(3, false, 1); + WFAAlignment result = extender.connect(sequence, from, to); + check_score(result, aligner, sequence.length() - 2, 0, 1, 2); + check_alignment(result, sequence, graph, aligner, &from, &to); + } + + SECTION("Take it five times, fail") { + std::string sequence("CGAGAGAGAGAGAT"); + pos_t from(1, false, 1); pos_t to(3, false, 1); + WFAAlignment result = extender.connect(sequence, from, to); + REQUIRE_FALSE(result); + } + + SECTION("Reach the end twice") { + // MM|MM|M + std::string sequence("GCGAG"); + pos_t from(1, false, 0); pos_t to(2, false, 1); + WFAAlignment result = extender.connect(sequence, from, to); + check_score(result, aligner, sequence.length(), 0, 0, 0); + check_alignment(result, sequence, graph, aligner, &from, &to); + } + + SECTION("Identical endpoints, at start") { + // M|MX; correct endpoint on the third visit to the node + std::string sequence("AGG"); + pos_t from(2, false, 0); pos_t to(2, false, 0); + WFAAlignment result = extender.connect(sequence, from, to); + check_score(result, aligner, sequence.length() - 1, 1, 0, 0); + check_alignment(result, sequence, graph, aligner, &from, &to); + } + + SECTION("Identical endpoints, at end") { + // MM|X; correct endpoint on the third visit to the node + std::string sequence("GAT"); + pos_t from(2, false, 1); pos_t to(2, false, 1); + WFAAlignment result = extender.connect(sequence, from, to); + check_score(result, aligner, sequence.length() - 1, 1, 0, 0); + check_alignment(result, sequence, graph, aligner, &from, &to); + } +} + +//------------------------------------------------------------------------------ + } }