diff --git a/src/augment.cpp b/src/augment.cpp index 69ce256e57d..12d0eb35dc7 100644 --- a/src/augment.cpp +++ b/src/augment.cpp @@ -15,7 +15,7 @@ void augment(MutablePathMutableHandleGraph* graph, istream& gam_stream, vector<Translation>* out_translations, ostream* gam_out_stream, - function<void(Path&)> save_path_fn, + bool embed_paths, bool break_at_ends, bool remove_softclips) { @@ -32,7 +32,7 @@ void augment(MutablePathMutableHandleGraph* graph, iterate_gam, out_translations, gam_out_stream, - save_path_fn, + embed_paths, break_at_ends, remove_softclips); } @@ -41,7 +41,7 @@ void augment(MutablePathMutableHandleGraph* graph, vector<Path>& path_vector, vector<Translation>* out_translations, ostream* gam_out_stream, - function<void(Path&)> save_path_fn, + bool embed_paths, bool break_at_ends, bool remove_softclips) { @@ -59,7 +59,7 @@ void augment(MutablePathMutableHandleGraph* graph, iterate_gam, out_translations, gam_out_stream, - save_path_fn, + embed_paths, break_at_ends, remove_softclips); } @@ -68,7 +68,7 @@ void augment_impl(MutablePathMutableHandleGraph* graph, function<void(function<void(Alignment&)>, bool)> iterate_gam, vector<Translation>* out_translations, ostream* gam_out_stream, - function<void(Path&)> save_path_fn, + bool embed_paths, bool break_at_ends, bool remove_softclips) { // Collect the breakpoints @@ -138,8 +138,8 @@ void augment_impl(MutablePathMutableHandleGraph* graph, // Copy over the name *added.mutable_name() = aln.name(); - if (save_path_fn) { - save_path_fn(added); + if (embed_paths) { + add_path_to_graph(graph, added); } // something is off about this check. @@ -198,6 +198,19 @@ void augment_impl(MutablePathMutableHandleGraph* graph, if (out_translations != nullptr) { *out_translations = make_translation(graph, node_translation, added_nodes, orig_node_sizes); } + + VG* vg_graph = dynamic_cast<VG*>(graph); + + // This code got run after augment in VG::edit, so we make sure it happens here too + if (vg_graph != nullptr) { + // Rebuild path ranks, aux mapping, etc. by compacting the path ranks + // Todo: can we just do this once? + vg_graph->paths.compact_ranks(); + + // execute a semi partial order sort on the nodes + vg_graph->sort(); + } + } @@ -297,6 +310,15 @@ void find_breakpoints(const Path& path, unordered_map<id_t, set<pos_t>>& breakpo } +path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path) { + path_handle_t path_handle = graph->create_path_handle(path.name(), path.is_circular()); + for (int i = 0; i < path.mapping_size(); ++i) { + graph->append_step(path_handle, graph->get_handle(path.mapping(i).position().node_id(), + path.mapping(i).position().is_reverse())); + } + return path_handle; +} + unordered_map<id_t, set<pos_t>> forwardize_breakpoints(const HandleGraph* graph, const unordered_map<id_t, set<pos_t>>& breakpoints) { unordered_map<id_t, set<pos_t>> fwd; diff --git a/src/augment.hpp b/src/augment.hpp index 7452faca786..f33902f0fec 100644 --- a/src/augment.hpp +++ b/src/augment.hpp @@ -24,7 +24,7 @@ using namespace std; /// If out_translation is not null, a list of translations, one per node existing /// after the edit, describing /// how each new or conserved node is embedded in the old graph. -/// if save_path_fn is not null, this function will be run on each modified path +/// if embed_paths is true, then the augmented alignments will be saved as embedded paths in the graph /// in order to add it back to the graph. 
/// If break_at_ends is true, nodes will be broken at /// the ends of paths that start/end with perfect matches, so the paths can @@ -35,7 +35,7 @@ void augment(MutablePathMutableHandleGraph* graph, istream& gam_stream, vector<Translation>* out_translation = nullptr, ostream* gam_out_stream = nullptr, - function<void(Path&)> save_path_fn = nullptr, + bool embed_paths = false, bool break_at_ends = false, bool remove_soft_clips = false); @@ -45,7 +45,7 @@ void augment(MutablePathMutableHandleGraph* graph, vector<Path>& path_vector, vector<Translation>* out_translation = nullptr, ostream* gam_out_stream = nullptr, - function<void(Path&)> save_path_fn = nullptr, + bool embed_paths = false, bool break_at_ends = false, bool remove_soft_clips = false); @@ -54,10 +54,14 @@ void augment_impl(MutablePathMutableHandleGraph* graph, function<void(function<void(Alignment&)>, bool)> iterate_gam, vector<Translation>* out_translation, ostream* gam_out_stream, - function<void(Path&)> save_path_fn, + bool embed_paths, bool break_at_ends, bool remove_soft_clips); +/// Add a path to the graph. This is like VG::extend: it expects +/// a path with no edits, and requires all the nodes and edges in the path +/// to exist exactly in the graph. +path_handle_t add_path_to_graph(MutablePathHandleGraph* graph, const Path& path); /// Find all the points at which a Path enters or leaves nodes in the graph. Adds /// them to the given map by node ID of sets of bases in the node that will need diff --git a/src/io/register_loader_saver_hash_graph.cpp b/src/io/register_loader_saver_hash_graph.cpp index 837d145880e..f2d4b1fdc10 100644 --- a/src/io/register_loader_saver_hash_graph.cpp +++ b/src/io/register_loader_saver_hash_graph.cpp @@ -17,7 +17,7 @@ using namespace std; using namespace vg::io; void register_loader_saver_hash_graph() { - Registry::register_bare_loader_saver("HashGraph", [](istream& input) -> void* { + Registry::register_bare_loader_saver("HashGraph", [](istream& input) -> void* { // Allocate a HashGraph bdsg::HashGraph* hash_graph = new bdsg::HashGraph(); diff --git a/src/io/register_loader_saver_odgi.cpp b/src/io/register_loader_saver_odgi.cpp index ebc2c5e118b..0bf21f22c86 100644 --- a/src/io/register_loader_saver_odgi.cpp +++ b/src/io/register_loader_saver_odgi.cpp @@ -17,9 +17,9 @@ using namespace std; using namespace vg::io; void register_loader_saver_odgi() { - Registry::register_bare_loader_saver("PackedGraph", [](istream& input) -> void* { + Registry::register_bare_loader_saver("PackedGraph", [](istream& input) -> void* { // Allocate a PackedGraph - bdsg::ODGI* odgi = new bdsg::ODGI(); + bdsg::ODGI* odgi = new bdsg::ODGI(); // Load it odgi->load(input); diff --git a/src/io/register_loader_saver_packed_graph.cpp b/src/io/register_loader_saver_packed_graph.cpp index 8ee650d19c9..510ead17723 100644 --- a/src/io/register_loader_saver_packed_graph.cpp +++ b/src/io/register_loader_saver_packed_graph.cpp @@ -17,7 +17,7 @@ using namespace std; using namespace vg::io; void register_loader_saver_packed_graph() { - Registry::register_bare_loader_saver("PackedGraph", [](istream& input) -> void* { + Registry::register_bare_loader_saver("PackedGraph", [](istream& input) -> void* { // Allocate a PackedGraph bdsg::PackedGraph* packed_graph = new bdsg::PackedGraph(); diff --git a/src/io/register_loader_saver_vg.cpp b/src/io/register_loader_saver_vg.cpp index 8e7307e6f47..a1554008df8 100644 --- a/src/io/register_loader_saver_vg.cpp +++ b/src/io/register_loader_saver_vg.cpp @@ -17,7 +17,7 @@ using namespace vg::io; void register_loader_saver_vg() { // We register for "" so we can handle untagged old-style vg files and make them into 
HandleGraphs - Registry::register_loader_saver(vector{"VG", ""}, + Registry::register_loader_saver(vector{"VG", ""}, [](const message_sender_function_t& for_each_message) -> void* { // We have a bit of a control problem. // The source function wants to drive; we give it a function of strings, and it calls it with all the strings in turn. diff --git a/src/pileup.cpp b/src/pileup.cpp deleted file mode 100644 index eb08ce30822..00000000000 --- a/src/pileup.cpp +++ /dev/null @@ -1,864 +0,0 @@ -#include -#include -#include -#include "json2pb.h" -#include "pileup.hpp" -#include - -//#define debug - -using namespace std; - -namespace vg { - -void Pileups::clear() { - for (auto& p : _node_pileups) { - delete p.second; - } - _node_pileups.clear(); - - for (auto& p : _edge_pileups) { - delete p.second; - } - _edge_pileups.clear(); - _min_quality_count = 0; - _max_mismatch_count = 0; - _bases_count = 0; -} - -void Pileups::to_json(ostream& out) { - out << "{\"node_pileups\": ["; - for (NodePileupHash::iterator i = _node_pileups.begin(); i != _node_pileups.end();) { - out << pb2json(*i->second); - ++i; - if (i != _node_pileups.end()) { - out << ","; - } - } - out << "]," << endl << "\"edge_pileups\": ["; - for (EdgePileupHash::iterator i = _edge_pileups.begin(); i != _edge_pileups.end();) { - out << pb2json(*i->second); - ++i; - if (i != _edge_pileups.end()) { - out << ","; - } - } - out << "]}" << endl; -} - -void Pileups::load(istream& in) { - function lambda = [this](Pileup& pileup) { - extend(pileup); - }; - vg::io::for_each(in, lambda); -} - -void Pileups::write(ostream& out, size_t chunk_size) { - - int64_t count = max(_node_pileups.size(), _edge_pileups.size()) / chunk_size; - if (max(_node_pileups.size(), _edge_pileups.size()) % chunk_size != 0) { - ++count; - } - - NodePileupHash::iterator node_it = _node_pileups.begin(); - EdgePileupHash::iterator edge_it = _edge_pileups.begin(); - Pileup pileup; - - // note: this won't work at all in parallel but write is single threaded... - function lambda = [&](size_t i) -> Pileup& { - pileup.clear_node_pileups(); - pileup.clear_edge_pileups(); - for (size_t j = 0; j < chunk_size && node_it != _node_pileups.end(); ++j, ++node_it) { - NodePileup* np = pileup.add_node_pileups(); - *np = *node_it->second; - } - // unlike for Graph, we don't bother to try to group edges with nodes they attach - for (size_t j = 0; j < chunk_size && edge_it != _edge_pileups.end(); ++j, ++edge_it) { - EdgePileup* ep = pileup.add_edge_pileups(); - *ep = *edge_it->second; - } - - return pileup; - }; - - vg::io::write(out, count, lambda); - vg::io::finish(out); -} - -void Pileups::for_each_node_pileup(const function& lambda) { - for (auto& p : _node_pileups) { - lambda(*p.second); - } -} - -void Pileups::for_each_edge_pileup(const function& lambda) { - for (auto& p : _edge_pileups) { - lambda(*p.second); - } -} - -EdgePileup* Pileups::get_edge_pileup(pair sides) { - if (sides.second < sides.first) { - std::swap(sides.first, sides.second); - } - auto p = _edge_pileups.find(sides); - return p != _edge_pileups.end() ? p->second : NULL; -} - -// get a pileup. if it's null, create a new one and insert it. 
-EdgePileup* Pileups::get_create_edge_pileup(pair sides) { - if (sides.second < sides.first) { - std::swap(sides.first, sides.second); - } - EdgePileup* p = get_edge_pileup(sides); - if (p == NULL) { - p = new EdgePileup(); - p->mutable_edge()->set_from(sides.first.node); - p->mutable_edge()->set_from_start(!sides.first.is_end); - p->mutable_edge()->set_to(sides.second.node); - p->mutable_edge()->set_to_end(sides.second.is_end); - _edge_pileups[sides] = p; - } - return p; -} - - -void Pileups::extend(Pileup& pileup) { - for (int i = 0; i < pileup.node_pileups_size(); ++i) { - insert_node_pileup(new NodePileup(pileup.node_pileups(i))); - } - for (int i = 0; i < pileup.edge_pileups_size(); ++i) { - insert_edge_pileup(new EdgePileup(pileup.edge_pileups(i))); - } -} - -bool Pileups::insert_node_pileup(NodePileup* pileup) { - NodePileup* existing = get_node_pileup(pileup->node_id()); - if (existing != NULL) { - merge_node_pileups(*existing, *pileup); - delete pileup; - } else { - _node_pileups[pileup->node_id()] = pileup; - } - return existing == NULL; -} - -bool Pileups::insert_edge_pileup(EdgePileup* pileup) { - EdgePileup* existing = get_edge_pileup(NodeSide::pair_from_edge(*pileup->mutable_edge())); - if (existing != NULL) { - merge_edge_pileups(*existing, *pileup); - delete pileup; - } else { - _edge_pileups[NodeSide::pair_from_edge(*pileup->mutable_edge())] = pileup; - } - return existing == NULL; -} - -void Pileups::compute_from_alignment(Alignment& alignment) { - const Path& path = alignment.path(); - int64_t read_offset = 0; - vector mismatch_counts; - count_mismatches(*_graph, path, mismatch_counts); - // element i = location of rank i in the mapping array - vector ranks(path.mapping_size() + 1, -1); - // keep track of read offset of mapping array element i - vector in_read_offsets(path.mapping_size()); - vector out_read_offsets(path.mapping_size()); - // keep track of last mapping, offset of match, and open deletion for - // calling deletion endpoints (which are beside, but not on the base offsets they get written to) - pair last_match(NULL, -1); - pair last_del(NULL, -1); - pair open_del(NULL, -1); - for (int i = 0; i < path.mapping_size(); ++i) { - const Mapping& mapping = path.mapping(i); - int rank = mapping.rank() <= 0 ? i + 1 : mapping.rank(); - if (_graph->has_node(mapping.position().node_id())) { - const Node* node = _graph->get_node(mapping.position().node_id()); - NodePileup* pileup = get_create_node_pileup(node); - int64_t node_offset = mapping.position().offset(); - // utilize forward-relative node offset (old way), which - // is not consistent with current protobuf. conversion here. - if (mapping.position().is_reverse()) { - node_offset = node->sequence().length() - 1 - node_offset; - } - // If we mismatch alignments and graphs, we can get into trouble. - assert(node_offset >= 0); - in_read_offsets[i] = read_offset; - for (int j = 0; j < mapping.edit_size(); ++j) { - const Edit& edit = mapping.edit(j); - const Edit* next_edit = NULL; - if (j + 1 < mapping.edit_size()) { - next_edit = &mapping.edit(j + 1); - } else if (i + 1 < path.mapping_size() && path.mapping(i + 1).edit_size() > 0) { - next_edit = &path.mapping(i + 1).edit(0); - } - // process all pileups in edit. 
- // update the offsets as we go - compute_from_edit(*pileup, node_offset, read_offset, *node, - alignment, mapping, edit, next_edit, mismatch_counts, - last_match, last_del, open_del); - } - out_read_offsets[i] = read_offset - 1; - - if (rank <= 0 || rank >= ranks.size() || ranks[rank] != -1) { - cerr << "Error determining rank of mapping " << i << " in path " << path.name() << ": " - << pb2json(mapping) << endl; - } - else { - ranks[rank] = i; - } - } else { - // node not in graph. that's okay, we do nothing but update the read_offset to - // not trigger assert at end of this function - for (int j = 0; j < mapping.edit_size(); ++j) { - read_offset += mapping.edit(j).to_length(); - } - ranks[rank] = -1; - } - } - // loop again over all the edges crossed by the mapping alignment, using - // the offsets and ranking information we got in the first pass - for (int i = 2; i < ranks.size(); ++i) { - int rank1_idx = ranks[i-1]; - int rank2_idx = ranks[i]; - if ((rank1_idx > 0 || rank2_idx > 0) && (rank1_idx >= 0 && rank2_idx >= 0)) { - auto& m1 = path.mapping(rank1_idx); - auto& m2 = path.mapping(rank2_idx); - // only count edges bookended by matches (unless _strict_edge_support is false) - size_t m1eds = m1.edit_size(); - -#ifdef debug - cerr << "Alignment crosses edge " << pb2json(m1) << "->" << pb2json(m2) << endl; -#endif - - if (!_strict_edge_support || - ((m1eds == 0 || m1.edit(m1eds - 1).from_length() == m1.edit(m1eds - 1).to_length()) && - (m2.edit_size() == 0 || m2.edit(0).from_length() == m2.edit(0).to_length()))) { - auto s1 = NodeSide(m1.position().node_id(), (m1.position().is_reverse() ? false : true)); - auto s2 = NodeSide(m2.position().node_id(), (m2.position().is_reverse() ? true : false)); - // no quality gives a free pass from quality filter - char edge_qual = 127; - if (!alignment.quality().empty()) { - char from_qual = alignment.quality()[out_read_offsets[rank1_idx]]; - char to_qual = alignment.quality()[in_read_offsets[rank2_idx]]; - edge_qual = combined_quality(min(from_qual, to_qual), alignment.mapping_quality()); - } - if (edge_qual >= _min_quality) { -#ifdef debug - cerr << "\tMeets quality threshold" << endl; -#endif - EdgePileup* edge_pileup = get_create_edge_pileup(pair(s1, s2)); - if (edge_pileup->num_reads() < _max_depth) { - edge_pileup->set_num_reads(edge_pileup->num_reads() + 1); - - if (m1.position().node_id() < m2.position().node_id()) { - // We are going along the forward strand of the edge, from low ID to high ID - // TODO: What do we do if augmentation renumbers the nodes differently? -#ifdef debug - cerr << "\tTreat as a forward strand edge support between different nodes" << endl; -#endif - edge_pileup->set_num_forward_reads(edge_pileup->num_forward_reads() + 1); - } else if (m1.position().node_id() == m2.position().node_id() && !m1.position().is_reverse()) { - // This is a self loop and we are reading forward - // into it. For self loops around we call it the - // forward strand. For reversing self loops, on the - // end they are always the forward strand, and on - // the start they are never the forward strand. - // TODO: what if augmentation dvides a node that had a self loop and changes our numbering? 
-#ifdef debug - cerr << "\tTreat as a forward strand edge support on a self loop" << endl; -#endif - edge_pileup->set_num_forward_reads(edge_pileup->num_forward_reads() + 1); - } else { -#ifdef debug - cerr << "\tTreat as a reverse strand edge support" << endl; -#endif - } - - // TODO: Should we determine forward/reverse based on - // if we agree with the from/to of the edge as written - // in the graph instead? - - if (!alignment.quality().empty()) { - *edge_pileup->mutable_qualities() += edge_qual; - } - } - } - } - } - } - - assert(alignment.sequence().empty() || - alignment.path().mapping_size() == 0 || - read_offset == alignment.sequence().length()); - -} - -void Pileups::compute_from_edit(NodePileup& pileup, int64_t& node_offset, - int64_t& read_offset, - const Node& node, const Alignment& alignment, - const Mapping& mapping, const Edit& edit, - const Edit* next_edit, - const vector& mismatch_counts, - pair& last_match, - pair& last_del, - pair& open_del) { - string seq = edit.sequence(); - // is the mapping reversed wrt read sequence? use for iterating - bool map_reverse = mapping.position().is_reverse(); - - // ***** MATCH ***** - if (edit.from_length() == edit.to_length()) { - assert (edit.from_length() > 0); - make_match(seq, edit.from_length(), map_reverse); - assert(seq.length() == edit.from_length()); - int64_t delta = map_reverse ? -1 : 1; - for (int64_t i = 0; i < edit.from_length(); ++i) { - if (pass_filter(alignment, read_offset, 1, mismatch_counts)) { - // Don't go outside the node - if (node_offset >= _graph->get_node(pileup.node_id())->sequence().size()) { - cerr << "error [vg::Pileups] node_offset of " << node_offset << " on " << pileup.node_id() << " is too big for node of size " << _graph->get_node(pileup.node_id())->sequence().size() << endl; - cerr << "Alignment: " << pb2json(alignment) << endl; - throw runtime_error("Node offset too large in alignment"); - } - BasePileup* base_pileup = get_create_base_pileup(pileup, node_offset); - if (base_pileup->num_bases() < _max_depth) { - // reference_base if empty - if (base_pileup->num_bases() == 0) { - base_pileup->set_ref_base(node.sequence()[node_offset]); - } else { - assert(base_pileup->ref_base() == node.sequence()[node_offset]); - } - // add base to bases field (converting to ,. 
if match) - char base = seq[i]; - *base_pileup->mutable_bases() += base; - // add quality if there - if (!alignment.quality().empty()) { - *base_pileup->mutable_qualities() += min((int32_t)alignment.quality()[read_offset], (int32_t)alignment.mapping_quality()); - } - // pileup size increases by 1 - base_pileup->set_num_bases(base_pileup->num_bases() + 1); - } - // close off any open deletion - if (open_del.first != NULL) { - string del_seq; - make_delete(del_seq, map_reverse, last_match, mapping, node_offset); - int64_t dp_node_id; - int64_t dp_node_offset; - // store in canonical position - if (make_pair(make_pair(last_del.first->position().node_id(), last_del.second), - last_del.first->position().is_reverse()) < - make_pair(make_pair(open_del.first->position().node_id(), open_del.second), - open_del.first->position().is_reverse())) { - dp_node_id = last_del.first->position().node_id(); - dp_node_offset = last_del.second; - } else { - dp_node_id = open_del.first->position().node_id(); - dp_node_offset = open_del.second; - } - Node* dp_node = _graph->get_node(dp_node_id); - // Don't go outside the node - assert(dp_node_offset < dp_node->sequence().size()); - NodePileup* dp_node_pileup = get_create_node_pileup(dp_node); - BasePileup* dp_base_pileup = get_create_base_pileup(*dp_node_pileup, dp_node_offset); - if (dp_base_pileup->num_bases() < _max_depth) { - // reference_base if empty - if (dp_base_pileup->num_bases() == 0) { - dp_base_pileup->set_ref_base(dp_node->sequence()[dp_node_offset]); - } else { - assert(dp_base_pileup->ref_base() == dp_node->sequence()[dp_node_offset]); - } - *dp_base_pileup->mutable_bases() += del_seq; - if (!alignment.quality().empty()) { - // we only use quality of one endpoint here. should average - *dp_base_pileup->mutable_qualities() += combined_quality(alignment.quality()[read_offset], - alignment.mapping_quality()); - } - dp_base_pileup->set_num_bases(dp_base_pileup->num_bases() + 1); - } - open_del = make_pair((Mapping*)NULL, -1); - last_del = make_pair((Mapping*)NULL, -1); - } - - last_match = make_pair(&mapping, node_offset); - } - // move right along read, and left/right depending on strand on reference - node_offset += delta; - ++read_offset; - } - } - // ***** INSERT ***** - else if (edit.from_length() < edit.to_length()) { - if (pass_filter(alignment, read_offset, edit.to_length(), mismatch_counts)) { - make_insert(seq, map_reverse); - assert(edit.from_length() == 0); - // we define insert (like sam) as insertion between current and next - // position (on forward node coordinates). this means an insertion before - // offset 0 is invalid! - int64_t insert_offset = map_reverse ? 
node_offset : node_offset - 1; - if (insert_offset >= 0 && - // make sure we have a match before and after the insert to take it seriously - next_edit != NULL && last_match.first != NULL && - next_edit->from_length() == next_edit->to_length()) { - // Don't go outside the node - assert(insert_offset < _graph->get_node(pileup.node_id())->sequence().size()); - BasePileup* base_pileup = get_create_base_pileup(pileup, insert_offset); - if (base_pileup->num_bases() < _max_depth) { - // reference_base if empty - if (base_pileup->num_bases() == 0) { - base_pileup->set_ref_base(node.sequence()[insert_offset]); - } else { - assert(base_pileup->ref_base() == node.sequence()[insert_offset]); - } - // add insertion string to bases field - base_pileup->mutable_bases()->append(seq); - if (!alignment.quality().empty()) { - *base_pileup->mutable_qualities() += combined_quality(alignment.quality()[read_offset], - alignment.mapping_quality()); - } - // pileup size increases by 1 - base_pileup->set_num_bases(base_pileup->num_bases() + 1); - } - } - else { - // need to check with aligner to make sure this doesn't happen, ie - // inserts would hang off the end of previous node instead of start - // of this node - /* - stringstream ss; - ss << "Warning: pileup does not support insertions before 0th base in node." - << " Offending edit: " << pb2json(edit) << endl; - #pragma omp critical(cerr) - cerr << ss.str(); - */ - } - } - // move right along read (and stay put on reference) - read_offset += edit.to_length(); - } - // ***** DELETE ***** - else { - if (pass_filter(alignment, read_offset, 1, mismatch_counts)) { - assert(edit.to_length() == 0); - assert(edit.sequence().empty()); - - // deltion will get written in the "Match" section - // note: deletions will only get written if there's a match on either side - // so deletions at beginning/end of read ignored in pileup - if (open_del.first == NULL && last_match.first != NULL) { - open_del = make_pair(&mapping, node_offset); - } - // open_del : first base deleted by deleltion - // last_del : most recent base deleted by deletion - // last_match : most recent base in a match - // (most recent is in order we are scanning here) - - // a deletion will be an edge between two matches. - // but in the pileup, it will be stored in either open_del or last_del - // (which ever has lower coordinate). - } - int64_t delta = map_reverse ? -edit.from_length() : edit.from_length(); - - // stay put on read, move left/right depending on strand on reference - node_offset += delta; - - last_del = make_pair(&mapping, map_reverse ? node_offset + 1 : node_offset - 1); - } -} - -void Pileups::count_mismatches(VG& graph, const Path& path, - vector& mismatches, - bool skipIndels) -{ - mismatches.clear(); - int64_t read_offset = 0; - for (int i = 0; i < path.mapping_size(); ++i) { - const Mapping& mapping = path.mapping(i); - if (graph.has_node(mapping.position().node_id())) { - const Node* node = graph.get_node(mapping.position().node_id()); - int64_t node_offset = mapping.position().offset(); - // utilize forward-relative node offset (old way), which - // is not consistent with current protobuf. conversion here. - if (mapping.position().is_reverse()) { - node_offset = node->sequence().length() - 1 - node_offset; - } - - for (int j = 0; j < mapping.edit_size(); ++j) { - const Edit& edit = mapping.edit(j); - // process all pileups in edit. 
- // update the offsets as we go - string seq = edit.sequence(); - bool is_reverse = mapping.position().is_reverse(); - if (is_reverse) { - seq = reverse_complement(seq); - } - - // ***** MATCH ***** - if (edit.from_length() == edit.to_length()) { - int64_t delta = is_reverse ? -1 : 1; - for (int64_t i = 0; i < edit.from_length(); ++i) { - if (!edit.sequence().empty() && - !base_equal(seq[i], node->sequence()[node_offset], false)) { - mismatches.push_back(1); - } - else { - mismatches.push_back(0); - } - // move right along read, and left/right depending on strand on reference - node_offset += delta; - ++read_offset; - } - } - // ***** INSERT ***** - else if (edit.from_length() < edit.to_length()) { - if (skipIndels == false) { - mismatches.push_back(1); - for (int x = 1; x < edit.to_length(); ++x) { - mismatches.push_back(0); - } - } - // move right along read (and stay put on reference) - read_offset += edit.to_length(); - } - // ***** DELETE ***** - else { - if (skipIndels == false) { - // since we're working in read coordinates, we count - // a single mismatch right before the delete. - if (mismatches.size() > 0) { - mismatches[mismatches.size() - 1] = 1; - } - } - int64_t delta = is_reverse ? -edit.from_length() : edit.from_length(); - // stay put on read, move left/right depending on strand on reference - node_offset += delta; - } - } - } else { - // node not in graph: count 0 mismatches for each absent position - for (int j = 0; j < mapping.edit_size(); ++j) { - read_offset += mapping.edit(j).to_length(); - for (int k = 0; k < mapping.edit(j).to_length(); ++k) { - mismatches.push_back(0); - } - } - } - } - assert(skipIndels || read_offset == mismatches.size()); - // too lazy to do full count inline. sum up here - int count = 0; - for (int i = 0; i < mismatches.size(); ++i) { - count += mismatches[i]; - mismatches[i] = count; - } -} - -bool Pileups::pass_filter(const Alignment& alignment, int64_t read_offset, - int64_t length, const vector& mismatches) const -{ - bool min_quality_fail = false; - bool max_mismatch_fail = false; - // loop is becaues insertions are considered as one block - // in this case entire block fails if single base fails - for (int64_t cur_offset = read_offset; cur_offset < read_offset + length; ++cur_offset) { - if (!alignment.quality().empty()) { - if (combined_quality(alignment.quality()[cur_offset], alignment.mapping_quality()) < _min_quality) { - min_quality_fail = true; - break; - } - } - if (_window_size > 0) { - // counts in left window - int64_t left_point = max((int64_t)0, cur_offset - _window_size / 2 - 1); - int64_t right_point = max((int64_t)0, cur_offset - 1); - int64_t count = mismatches[right_point] - mismatches[left_point]; - // coutns in right window - left_point = cur_offset; - right_point = min(cur_offset + _window_size / 2, (int64_t)mismatches.size() - 1); - count += mismatches[right_point] - mismatches[left_point]; - if (count > _max_mismatches) { - max_mismatch_fail = true; - break; - } - } - } - if (max_mismatch_fail) { - _max_mismatch_count += length; - } - if (min_quality_fail) { - _min_quality_count += length; - } - _bases_count += length; - return !max_mismatch_fail && !min_quality_fail; -} - -Pileups& Pileups::merge(Pileups& other) { - for (auto& p : other._node_pileups) { - insert_node_pileup(p.second); - } - other._node_pileups.clear(); - for (auto& p : other._edge_pileups) { - insert_edge_pileup(p.second); - } - other._edge_pileups.clear(); - _min_quality_count += other._min_quality_count; - other._min_quality_count = 0; - 
_max_mismatch_count += other._max_mismatch_count; - other._max_mismatch_count = 0; - _bases_count += other._bases_count; - other._bases_count = 0; - return *this; -} - -BasePileup& Pileups::merge_base_pileups(BasePileup& p1, BasePileup& p2) { - assert(p1.num_bases() == 0 || p2.num_bases() == 0 || - p1.ref_base() == p2.ref_base()); - if (p1.num_bases() == 0) { - p1.set_ref_base(p2.ref_base()); - } - int merge_size = min(p2.num_bases(), _max_depth - p1.num_bases()); - p1.set_num_bases(p1.num_bases() + merge_size); - if (merge_size == p2.num_bases()) { - p1.mutable_bases()->append(p2.bases()); - p1.mutable_qualities()->append(p2.qualities()); - } else if (merge_size > 0) { - vector > offsets; - parse_base_offsets(p2, offsets); - int merge_length = offsets[merge_size].first; - p1.mutable_bases()->append(p2.bases().substr(0, merge_length)); - if (!p2.qualities().empty()) { - p1.mutable_qualities()->append(p2.qualities().substr(0, merge_size)); - } - } - p2.set_num_bases(0); - p2.clear_bases(); - p2.clear_qualities(); - return p1; -} - -NodePileup& Pileups::merge_node_pileups(NodePileup& p1, NodePileup& p2) { - assert(p1.node_id() == p2.node_id()); - // Don't go outside the node - assert(p1.base_pileup_size() <= _graph->get_node(p1.node_id())->sequence().size()); - assert(p2.base_pileup_size() <= _graph->get_node(p2.node_id())->sequence().size()); - for (int i = 0; i < p2.base_pileup_size(); ++i) { - BasePileup* bp1 = get_create_base_pileup(p1, i); - BasePileup* bp2 = get_base_pileup(p2, i); - merge_base_pileups(*bp1, *bp2); - } - p2.clear_base_pileup(); - return p1; -} - -EdgePileup& Pileups::merge_edge_pileups(EdgePileup& p1, EdgePileup& p2) { - assert(p1.edge().from() == p2.edge().from()); - assert(p1.edge().to() == p2.edge().to()); - assert(p1.edge().from_start() == p2.edge().from_start()); - assert(p1.edge().to_end() == p2.edge().to_end()); - int merge_size = min(p2.num_reads(), _max_depth - p1.num_reads()); - p1.set_num_reads(p1.num_reads() + merge_size); - int forward_merge_size = p2.num_forward_reads() * - ((double)merge_size / (double)p2.num_reads()); - p1.set_num_forward_reads(p1.num_forward_reads() + forward_merge_size); - if (merge_size == p2.num_reads()) { - p1.mutable_qualities()->append(p2.qualities()); - } else if (!p2.qualities().empty()) { - p1.mutable_qualities()->append(p2.qualities().substr(0, merge_size)); - } - p2.set_num_reads(0); - p2.set_num_forward_reads(0); - p2.clear_qualities(); - return p1; -} - -void Pileups::parse_base_offsets(const BasePileup& bp, - vector >& offsets) { - offsets.clear(); - - const string& quals = bp.qualities(); - const string& bases = bp.bases(); - char ref_base = ::toupper(bp.ref_base()); - // we can use i to index the quality for the ith row of pileup, but - // need base_offset to get position of appropriate token in bases string - int64_t base_offset = 0; - for (int i = 0; i < bp.num_bases(); ++i) { - // insert - if (bases[base_offset] == '+') { - offsets.push_back(make_pair(base_offset, i < quals.length() ? i : -1)); - int64_t lf = base_offset + 1; - int64_t rf = lf; - while (rf < bases.length() && bases[rf] >= '0' && bases[rf] <= '9') { - ++rf; - } - stringstream ss(bases.substr(lf, rf - lf + 1)); - int64_t indel_len; - ss >> indel_len; - // ex: +5aaaaa. rf = lf = 1. indel_len = 5 -> increment 2+0+5=7 - base_offset += 1 + rf - lf + indel_len; - // delete - } else if (bases[base_offset] == '-') { - offsets.push_back(make_pair(base_offset, i < quals.length() ? 
i : -1)); - int64_t lf = base_offset + 1; - // eat up six semicolons - for (int64_t sc_count = 0; sc_count < 6; ++lf) { - if (bases[lf] == ';') { - ++sc_count; - } - } - // and last number - for (; bases[lf] >= '0' && bases[lf] <= '9'; ++lf); - base_offset = lf; - } - // match / snp - else { - offsets.push_back(make_pair(base_offset, i < quals.length() ? i : -1)); - ++base_offset; - } - } - assert(base_offset == bases.length()); -} - -// transform case of every character in string -void Pileups::casify(string& seq, bool is_reverse) { - if (is_reverse) { - transform(seq.begin(), seq.end(), seq.begin(), ::tolower); - } else { - transform(seq.begin(), seq.end(), seq.begin(), ::toupper); - } -} - -// make the sam pileup style token -void Pileups::make_match(string& seq, int64_t from_length, bool is_reverse) { - if (seq.length() == 0) { - seq = string(from_length, is_reverse ? ',' : '.'); - } else { - casify(seq, is_reverse); - } -} - -void Pileups::make_insert(string& seq, bool is_reverse) { - casify(seq, is_reverse); - stringstream ss; - ss << "+" << seq.length() << seq; - seq = ss.str(); -} - -void Pileups::make_delete(string& seq, bool is_reverse, const pair& last_match, - const Mapping& mapping, int64_t node_offset){ - int64_t from_id = last_match.first->position().node_id(); - int64_t from_offset = last_match.second; - bool from_start = last_match.first->position().is_reverse(); - int64_t to_id = mapping.position().node_id(); - int64_t to_offset = node_offset; - bool to_end = mapping.position().is_reverse(); - - // canonical order - if (make_pair(make_pair(from_id, from_offset), from_start) > - make_pair(make_pair(to_id, to_offset), to_end)) { - std::swap(from_id, to_id); - std::swap(from_offset, to_offset); - std::swap(from_start, to_end); - from_start = !from_start; - to_end = !to_end; - } - - make_delete(seq, is_reverse, from_id, from_offset, from_start, to_id, to_offset, to_end); -} - -void Pileups::make_delete(string& seq, bool is_reverse, - int64_t from_id, int64_t from_offset, bool from_start, - int64_t to_id, int64_t to_offset, bool to_end) { - // format : -is_reverse;from_id;from_offset;from_start;to_id;do_offset;to_end - stringstream ss; - ss << "-" << is_reverse << ";" << from_id << ";" << from_offset << ";" << from_start << ";" - << to_id << ";" << to_offset << ";" << to_end; - seq = ss.str(); -} - -void Pileups::parse_insert(const string& tok, int64_t& len, string& seq, bool& is_reverse) { - assert(tok[0] == '+'); - int64_t i = 1; - for (; tok[i] >= '0' && tok[i] <= '9'; ++i); - stringstream ss; - ss << tok.substr(1, i - 1); - ss >> len; - seq = tok.substr(i, tok.length() - i); - is_reverse = ::islower(seq[0]); -} - -void Pileups::parse_delete(const string& tok, bool& is_reverse, - int64_t& from_id, int64_t& from_offset, bool& from_start, - int64_t& to_id, int64_t& to_offset, bool& to_end) { - assert(tok[0] == '-'); - vector toks; - regex sc_re(";"); - std::copy(sregex_token_iterator(tok.begin(), tok.end(), sc_re, -1), - sregex_token_iterator(), back_inserter(toks)); - - assert(toks.size() == 7); - is_reverse = std::stoi(toks[0]) != 0; - - from_id = std::stoi(toks[1]); - from_offset = std::stoi(toks[2]); - from_start = std::stoi(toks[3]) != 0; - - to_id = std::stoi(toks[4]); - to_offset = std::stoi(toks[5]); - to_end = std::stoi(toks[6]) != 0; -} - -bool Pileups::base_equal(char c1, char c2, bool is_reverse) { - char t1 = ::toupper(c1); - char t2 = ::toupper(c2); - return is_reverse ? 
t1 == reverse_complement(t2) : t1 == t2; -} - -char Pileups::extract_match(const BasePileup& bp, int64_t offset) { - char v = bp.bases()[offset]; - assert(v != '+' && v != '-'); - if (v == ',' || v == '.') { - return ::toupper(bp.ref_base()); - } else if (::islower(v)) { - return reverse_complement(::toupper(v)); - } - return v; -} - -// get arbitrary value from offset on forward strand -string Pileups::extract(const BasePileup& bp, int64_t offset) { - const string& bases = bp.bases(); - if (bases[offset] != '+' && bases[offset] != '-') { - return string(1, extract_match(bp, offset)); - } - else if (bases[offset] == '+') { - string len_str; - for (int64_t i = offset + 1; bases[i] >= '0' && bases[i] <= '9'; ++i) { - len_str += bases[i]; - } - int64_t len = atoi(len_str.c_str()); - // forward strand, return as is - if (::isupper(bases[offset + 1 + len_str.length()])) { - return bases.substr(offset, 1 + len_str.length() + len); - } - // reverse strand, flip the dna bases part and return upper case - else { - string dna = bases.substr(offset + 1 + len_str.length(), len); - casify(dna, false); - return string(1, bases[offset]) + len_str + reverse_complement(dna); - } - } - else { - assert(bases[offset] == '-'); - // todo : consolidate deletion parsing code better than this - int64_t sc = 0; - int64_t i = offset; - for (; sc < 6; ++i) { - if (bases[i] == ';') { - ++sc; - } - } - return bases.substr(offset, i - offset + 1); - } -} - -} diff --git a/src/pileup.hpp b/src/pileup.hpp deleted file mode 100644 index d2954319592..00000000000 --- a/src/pileup.hpp +++ /dev/null @@ -1,289 +0,0 @@ -#ifndef VG_PILEUP_HPP_INCLUDED -#define VG_PILEUP_HPP_INCLUDED - -#include -#include -#include -#include -#include "vg.hpp" -#include "hash_map.hpp" -#include "utility.hpp" - -namespace vg { - -using namespace std; - -/// This is a collection of protobuf NodePileup records that are indexed -/// on their position, as well as EdgePileup records. -/// Pileups can be merged and streamed, and computed -/// from Alignments. The pileup records themselves are essentially -/// protobuf versions of lines in Samtools pileup format, with deletions -/// represented using a graph-based notation. 
-class Pileups { -public: - - Pileups(VG* graph, int min_quality = 0, int max_mismatches = 1, int window_size = 0, - int max_depth = 1000, bool use_mapq = false, bool strict_edge_support = false) : - _graph(graph), - _min_quality(min_quality), - _max_mismatches(max_mismatches), - _window_size(window_size), - _max_depth(max_depth), - _min_quality_count(0), - _max_mismatch_count(0), - _bases_count(0), - _use_mapq(use_mapq), - _strict_edge_support(strict_edge_support) -{} - - /// copy constructor - Pileups(const Pileups& other) { - if (this != &other) { - _graph = other._graph; - for (auto& p : other._node_pileups) { - insert_node_pileup(new NodePileup(*p.second)); - } - _min_quality = other._min_quality; - _max_mismatches = other._max_mismatches; - _window_size = other._window_size; - _max_depth = other._max_depth; - _min_quality_count = other._min_quality_count; - _max_mismatch_count = other._max_mismatch_count; - _bases_count = other._bases_count; - _use_mapq = other._use_mapq; - _strict_edge_support = other._strict_edge_support; - } - } - - /// move constructor - Pileups(Pileups&& other) noexcept { - _graph = other._graph; - _node_pileups = other._node_pileups; - other._node_pileups.clear(); - _min_quality = other._min_quality; - _max_mismatches = other._max_mismatches; - _window_size = other._window_size; - _max_depth = other._max_depth; - _min_quality_count = other._min_quality_count; - _max_mismatch_count = other._max_mismatch_count; - _bases_count = other._bases_count; - _use_mapq = other._use_mapq; - _strict_edge_support = other._strict_edge_support; - } - - /// copy assignment operator - Pileups& operator=(const Pileups& other) { - Pileups tmp(other); - *this = move(tmp); - return *this; - } - - /// move assignment operator - Pileups& operator=(Pileups&& other) noexcept { - _graph = other._graph; - swap(_node_pileups, other._node_pileups); - other._node_pileups.clear(); - _min_quality = other._min_quality; - _max_mismatches = other._max_mismatches; - _window_size = other._window_size; - _max_depth = other._max_depth; - _min_quality_count = other._min_quality_count; - _max_mismatch_count = other._max_mismatch_count; - _bases_count = other._bases_count; - _use_mapq = other._use_mapq; - _strict_edge_support = other._strict_edge_support; - return *this; - } - - /// delete contents of table - ~Pileups() { - clear(); - } - void clear(); - - // XXXX these should be hash_map but it won't compile unless they're explicitly defined - typedef spp::sparse_hash_map > NodePileupHash; - typedef spp::sparse_hash_map, EdgePileup*, wang_hash > > EdgePileupHash; - - VG* _graph; - - /// This maps from Position to Pileup. 
- NodePileupHash _node_pileups; - EdgePileupHash _edge_pileups; - - /// Ignore bases with quality less than this - int _min_quality; - /// max mismatches within window_size - int _max_mismatches; - /// number of bases to scan in each direction for mismatches - int _window_size; - /// prevent giant protobufs - int _max_depth; - /// toggle whether we incorporate Alignment.mapping_quality - bool _use_mapq; - /// toggle whether we only count edges if they are not book-ended by matches in the read - bool _strict_edge_support; - /// Keep count of bases filtered by quality - mutable uint64_t _min_quality_count; - /// keep count of bases filtered by mismatches - mutable uint64_t _max_mismatch_count; - /// overall count for perspective on above - mutable uint64_t _bases_count; - - /// write to JSON - void to_json(ostream& out); - /// read from protobuf - void load(istream& in); - /// write to protobuf, with EOF marker - void write(ostream& out, size_t buffer_size = 5); - - /// apply function to each pileup in table - void for_each_node_pileup(const function& lambda); - - /// search hash table for node id - NodePileup* get_node_pileup(int64_t node_id) { - auto p = _node_pileups.find(node_id); - return p != _node_pileups.end() ? p->second : NULL; - } - - /// get a pileup. if it's null, create a new one and insert it. - NodePileup* get_create_node_pileup(const Node* node) { - NodePileup* p = get_node_pileup(node->id()); - if (p == NULL) { - p = new NodePileup(); - p->set_node_id(node->id()); - for (int i = 0; i < node->sequence().length(); ++i) { - BasePileup* b = p->add_base_pileup(); - b->set_num_bases(0); - b->set_ref_base((int)node->sequence()[i]); - } - _node_pileups[node->id()] = p; - } - return p; - } - - void for_each_edge_pileup(const function& lambda); - - /// search hash table for edge id - EdgePileup* get_edge_pileup(pair sides); - - /// get a pileup. if it's null, create a new one and insert it. - EdgePileup* get_create_edge_pileup(pair sides); - - void extend(Pileup& pileup); - - /// insert a pileup into the table. it will be deleted by ~Pileups()!!! - /// return true if new pileup inserted, false if merged into existing one - bool insert_node_pileup(NodePileup* pileup); - bool insert_edge_pileup(EdgePileup* edge_pileup); - - /// create / update all pileups from a single alignment - void compute_from_alignment(Alignment& alignment); - - /// create / update all pileups from an edit (called by above). - /// query stores the current position (and nothing else). - void compute_from_edit(NodePileup& pileup, int64_t& node_offset, int64_t& read_offset, - const Node& node, const Alignment& alignment, - const Mapping& mapping, const Edit& edit, - const Edit* next_edit, - const vector& mismatch_counts, - pair& last_match, - pair& last_del, - pair& open_del); - - /// do one pass to count all mismatches in read, so we can do - /// mismatch filter efficiently in 2nd path. - /// mismatches[i] stores number of mismatches in range (0, i) - static void count_mismatches(VG& graph, const Path& path, vector& mismatches, - bool skipIndels = false); - - /// check base quality as well as miss match filter - bool pass_filter(const Alignment& alignment, int64_t read_offset, - int64_t length, - const vector& mismatches) const; - - /// move all entries in other object into this one. - /// if two positions collide, they are merged. - /// other will be left empty. this is returned - Pileups& merge(Pileups& other); - - /// merge p2 into p1 and return 1. 
p2 is left an empty husk - BasePileup& merge_base_pileups(BasePileup& p1, BasePileup& p2); - - /// merge p2 into p1 and return 1. p2 is lef an empty husk - NodePileup& merge_node_pileups(NodePileup& p1, NodePileup& p2); - - /// merge p2 into p1 and return 1. p2 is lef an empty husk - EdgePileup& merge_edge_pileups(EdgePileup& p1, EdgePileup& p2); - - /// create combine map quality (optionally) with base quality - char combined_quality(char base_quality, int map_quality) const { - if (!_use_mapq) { - return base_quality; - } else { - // assume independence: P[Correct] = P[Correct Base] * P[Correct Map] - // --> P[Error] = 1 - (1 - P[Base Error]) * (1 - P[Map Error]) - // (using same code as gentoyper:) - double p_err = logprob_invert(logprob_invert(phred_to_logprob((int)base_quality)) + - logprob_invert(phred_to_logprob(map_quality))); - int qual = logprob_to_phred(p_err); - return (char)min(qual, (int)numeric_limits::max()); - } - } - - /// get ith BasePileup record - static BasePileup* get_base_pileup(NodePileup& np, int64_t offset) { - assert(offset < np.base_pileup_size() && offset >= 0); - return np.mutable_base_pileup(offset); - } - static const BasePileup* get_base_pileup(const NodePileup& np, int64_t offset) { - assert(offset < np.base_pileup_size() && offset >= 0); - return &np.base_pileup(offset); - } - - /// get ith BasePileup record, create if doesn't exist - static BasePileup* get_create_base_pileup(NodePileup& np, int64_t offset) { - for (int64_t i = np.base_pileup_size(); i <= offset; ++i) { - np.add_base_pileup(); - } - return get_base_pileup(np, offset); - } - - /// the bases string in BasePileup doesn't allow random access. This function - /// will parse out all the offsets of snps, insertions, and deletions - /// into one array, each offset is a pair of indexes in the bases and qualities arrays - static void parse_base_offsets(const BasePileup& bp, - vector >& offsets); - - /// transform case of every character in string - static void casify(string& seq, bool is_reverse); - - /// make the sam pileup style token - static void make_match(string& seq, int64_t from_length, bool is_reverse); - static void make_insert(string& seq, bool is_reverse); - static void make_delete(string& seq, bool is_reverse, - const pair& last_match, - const Mapping& mapping, int64_t node_offset); - static void make_delete(string& seq, bool is_reverse, - int64_t from_id, int64_t from_offset, bool from_start, - int64_t to_id, int64_t to_offset, bool to_end); - - static void parse_insert(const string& tok, int64_t& len, string& seq, bool& is_reverse); - static void parse_delete(const string& tok, bool& is_reverse, - int64_t& from_id, int64_t& from_offset, bool& from_start, - int64_t& to_id, int64_t& to_offset, bool& to_end); - - static bool base_equal(char c1, char c2, bool is_reverse); - - /// get a pileup value on forward strand - static char extract_match(const BasePileup& bp, int64_t offset); - - /// get arbitrary value from offset on forward strand - static string extract(const BasePileup& bp, int64_t offset); -}; - - - -} - -#endif diff --git a/src/pileup_augmenter.cpp b/src/pileup_augmenter.cpp deleted file mode 100644 index 2bf583e7675..00000000000 --- a/src/pileup_augmenter.cpp +++ /dev/null @@ -1,1185 +0,0 @@ -#include -#include -#include "json2pb.h" -#include "pileup_augmenter.hpp" -#include - -using namespace std; - -namespace vg { - -const double PileupAugmenter::Log_zero = (double)-1e100; - -const char PileupAugmenter::Default_default_quality = 30; -const int 
PileupAugmenter::Default_min_aug_support = 2; - -PileupAugmenter::PileupAugmenter(VG* graph, - int default_quality, - int min_aug_support): - _graph(graph), - _default_quality(default_quality), - _min_aug_support(min_aug_support) { - assert(_min_aug_support > 0); - _max_id = _graph->max_node_id(); - _node_divider._max_id = &_max_id; - _augmented_graph.base_graph = graph; -} - -// delete contents of table -PileupAugmenter::~PileupAugmenter() { - clear(); -} - -void PileupAugmenter::clear() { - _node_calls.clear(); - _node_supports.clear(); - _insert_calls.clear(); - _insert_supports.clear(); - _augmented_graph.clear(); - _node_divider.clear(); - _visited_nodes.clear(); - _called_edges.clear(); - _augmented_edges.clear(); - _inserted_nodes.clear(); -} - -void PileupAugmenter::write_augmented_graph(ostream& out, bool json) { - if (json) { - _augmented_graph.graph.paths.to_graph(_augmented_graph.graph.graph); - out << pb2json(_augmented_graph.graph.graph); - } else { - _augmented_graph.graph.serialize_to_ostream(out); - } -} - -void PileupAugmenter::call_node_pileup(const NodePileup& pileup) { - - _node = _graph->get_node(pileup.node_id()); - assert(_node != NULL); - assert(_node->sequence().length() == pileup.base_pileup_size()); - - _node_calls.clear(); - _insert_calls.clear(); - string def_char = "-"; - _node_calls.assign(_node->sequence().length(), Genotype(def_char, def_char)); - _insert_calls.assign(_node->sequence().length(), Genotype(def_char, def_char)); - _node_supports.clear(); - _node_supports.assign(_node->sequence().length(), make_pair( - StrandSupport(), StrandSupport())); - _insert_supports.clear(); - _insert_supports.assign(_node->sequence().length(), make_pair( - StrandSupport(), StrandSupport())); - - // process each base in pileup individually - #pragma omp parallel for - for (int i = 0; i < pileup.base_pileup_size(); ++i) { - int num_inserts = 0; - for (auto b : pileup.base_pileup(i).bases()) { - if (b == '+') { - ++num_inserts; - } - } - int pileup_depth = max(num_inserts, pileup.base_pileup(i).num_bases() - num_inserts); - if (pileup_depth >= 1) { - call_base_pileup(pileup, i, false); - call_base_pileup(pileup, i, true); - } - } - - // add nodes and edges created when making calls to the output graph - // (_side_map gets updated) - create_node_calls(pileup); - - _visited_nodes.insert(_node->id()); -} - -void PileupAugmenter::call_edge_pileup(const EdgePileup& pileup) { - if (pileup.num_reads() >= 1) { - - double qual_sum = 0; - - for (int i = 0; i < pileup.num_reads(); ++i) { - char qual = !pileup.qualities().empty() ? pileup.qualities().at(i) : _default_quality; - qual_sum += (double)qual; - } - - Edge edge = pileup.edge(); // gcc not happy about passing directly - _called_edges[NodeSide::pair_from_edge(edge)] = StrandSupport( - pileup.num_forward_reads(), - pileup.num_reads() - pileup.num_forward_reads(), - qual_sum); - } -} - -void PileupAugmenter::update_augmented_graph() { - - // Add nodes we don't think necessarily exist. - function add_node = [&](Node* node) { - if (_visited_nodes.find(node->id()) == _visited_nodes.end()) { - Node* call_node = _augmented_graph.graph.create_node(node->sequence(), node->id()); - _node_divider.add_fragment(node, 0, call_node, NodeDivider::EntryCat::Ref, - vector()); - } - }; - _graph->for_each_node(add_node); - - // map every edge in the original graph to equivalent sides - // in the call graph. 
if both sides exist, make an edge in the call graph - function map_edge = [&](Edge* edge) { - pair sides = NodeSide::pair_from_edge(edge); - // skip uncalled edges if not writing augmented graph - auto called_it = _called_edges.find(sides); - bool called = called_it != _called_edges.end(); - - StrandSupport support = called ? called_it->second : StrandSupport(); - assert(support.fs >= 0 && support.rs >= 0); - - Node* side1 = _graph->get_node(sides.first.node); - Node* side2 = _graph->get_node(sides.second.node); - // find up to two nodes matching side1 in the call graph - int from_offset = !sides.first.is_end ? 0 : side1->sequence().length() - 1; - int to_offset = sides.second.is_end ? side2->sequence().length() - 1 : 0; - char cat = called ? 'R' : 'U'; - create_augmented_edge(side1, from_offset, !sides.first.is_end, true, - side2, to_offset, !sides.second.is_end, true, cat, - support); - }; - - function process_augmented_edges = [&](bool pass1) { - for (auto& i : _augmented_edges) { - auto& sides = i.first; - char cat = i.second; - NodeOffSide os1 = sides.first; - Node* node1; - bool aug1; - if (_graph->has_node(os1.first.node)) { - node1 = _graph->get_node(os1.first.node); - aug1 = true; - } else { - // snp or isnert node -- need to get from call grpah - // note : that we should never break these as they aren't in - // the divider structure (will be caught down the road) - node1 = _augmented_graph.graph.get_node(os1.first.node); - aug1 = false; - } - int from_offset = os1.second; - bool left1 = !os1.first.is_end; - NodeOffSide os2 = sides.second; - Node* node2; - bool aug2; - if (_graph->has_node(os2.first.node)) { - node2 = _graph->get_node(os2.first.node); - aug2 = true; - } else { - // snp or insert node -- need to get from call graph - node2 = _augmented_graph.graph.get_node(os2.first.node); - aug2 = false; - } - // only need to pass support for here insertions, other cases handled elsewhere - StrandSupport support(-1, -1); - if (cat == 'I') { - auto ins_it = _inserted_nodes.find(os1.first.node); - if (ins_it != _inserted_nodes.end()) { - support = ins_it->second.sup; - } else { - ins_it = _inserted_nodes.find(os2.first.node); - assert(ins_it != _inserted_nodes.end()); - support = ins_it->second.sup; - } - assert(support.fs >= 0 && support.rs >= 0); - } - int to_offset = os2.second; - bool left2 = !os2.first.is_end; - // todo: clean this up - if (!pass1) { - create_augmented_edge(node1, from_offset, left1, aug1, node2, to_offset, left2, aug2, cat, support); - } else { - _node_divider.break_end(node1, &_augmented_graph.graph, from_offset, left1); - _node_divider.break_end(node2, &_augmented_graph.graph, to_offset, left2); - } - } - }; - - // two passes here is a hack to make sure break_end is called on all edge ends - // before processing any of them. - process_augmented_edges(true); - _graph->for_each_edge(map_edge); - process_augmented_edges(false); - - // Annotate all the nodes in the divider structure in the AugmentedGraph - annotate_augmented_nodes(); - // add on the inserted nodes - for (auto i : _inserted_nodes) { - auto& n = i.second; - annotate_augmented_node(n.node, 'I', n.sup, n.orig_id, n.orig_offset); - } - annotate_non_augmented_nodes(); -} - -void PileupAugmenter::map_path(const Path& path, list& aug_path, bool expect_edits) { - int64_t last_rank = -1; - int64_t last_call_rank = 0; - size_t running_len = 0; - size_t path_len = 0; - for (int i = 0; i < path.mapping_size(); ++i) { - const Mapping& mapping = path.mapping(i); - int64_t rank = mapping.rank() == 0 ? 
i+1 : mapping.rank(); - size_t len = mapping_from_length(mapping); - // force length so that insertions come back through map_node() - // (they will be corrected in apply_edits) - if (len == 0) { - assert(expect_edits); - len = 1; - } - - int64_t node_id = mapping.position().node_id(); - Node* node = _graph->get_node(node_id); - - int start = mapping.position().offset(); - if (mapping.position().is_reverse()) { - start = node->sequence().length() - 1 - start; - } - int end = mapping.position().is_reverse() ? start - len + 1 : start + len - 1; - - // this is a projection onto the augmented graph for the entire "from length" - // of the mapping - list aug_mappings = _node_divider.map_node(node_id, start, len, - mapping.position().is_reverse()); - - // undo insertion length hack above - if (len == 1 && mapping_from_length(mapping) == 0) { - assert(aug_mappings.size() == 1 && aug_mappings.front().edit_size() == 1); - aug_mappings.front().mutable_edit(0)->set_from_length(0); - } - - // now we apply edits post-hoc to the new mappings - if (expect_edits) { - apply_mapping_edits(mapping, aug_mappings); - } - else { - assert(mapping.edit_size() == 1 - && mapping_from_length(mapping) == mapping_to_length(mapping)); - } - - // add our new mappings to the augmented path - for (auto& cm : aug_mappings) { - running_len += mapping_from_length(cm); - cm.set_rank(++last_call_rank); - aug_path.push_back(mapping_t(cm)); - } - path_len += len; - last_rank = rank; - } - assert(running_len == path_len); - verify_path(path, aug_path); -} - -void PileupAugmenter::apply_mapping_edits(const Mapping& base_mapping, list& aug_mappings) { - // current place in aug_mappings list - auto aug_mapping_it = aug_mappings.begin(); - // from length of current mapping - size_t aug_mapping_len = mapping_from_length(*aug_mapping_it); - // amount of from length we've covered in the current augmented mapping - size_t aug_mapping_used = 0; - - // make new edits in a list then add to protobuf (simpler for now) - list aug_mapping_edits; - - // sanity checking - size_t total_aug_from_length = 0; - size_t total_aug_to_length = 0; - - for (size_t base_edit_idx = 0; base_edit_idx < base_mapping.edit_size(); ++base_edit_idx) { - const Edit& base_edit = base_mapping.edit(base_edit_idx); - const Edit* next_base_edit = base_edit_idx < base_mapping.edit_size() - 1 ? 
- &base_mapping.edit(base_edit_idx + 1) : NULL; - // walk along our current base edit, making as many new augmented edits as required - // the max(1, from_length) below is a hack for insertions - for (size_t be_covered = 0; be_covered < max(1, base_edit.from_length());) { - Edit aug_edit; - size_t edit_len = min((size_t)base_edit.from_length() - be_covered, - aug_mapping_len - aug_mapping_used); - - aug_edit.set_from_length(edit_len); - if (base_edit.to_length() == base_edit.from_length()) { - // match : we need to cut the to length - aug_edit.set_to_length(edit_len); - if (!base_edit.sequence().empty()) { - aug_edit.set_sequence(base_edit.sequence().substr(be_covered, edit_len)); - } - } else { - // indel : we can leave the to length - aug_edit.set_to_length(base_edit.to_length()); - // insertion sequence - if (!base_edit.sequence().empty()) { - aug_edit.set_sequence(base_edit.sequence()); - } - } - - total_aug_from_length += aug_edit.from_length(); - total_aug_to_length += aug_edit.to_length(); - - aug_mapping_edits.push_back(aug_edit); - - // advance in base edit - be_covered += max(edit_len, (size_t)1); - - // advance in augmented edit (and mapping if necessary) - aug_mapping_used += edit_len; - - assert(aug_mapping_used <= aug_mapping_len); - if (aug_mapping_used == aug_mapping_len && - // we don't advance to the next mapping if we've got an insertion (0 from length) next - (be_covered != base_edit.from_length() || !next_base_edit || - next_base_edit->from_length() > 0)) { - // appy to protobuf - assert(aug_mapping_it != aug_mappings.end()); - aug_mapping_it->clear_edit(); - for (auto& aug_edit : aug_mapping_edits) { - *aug_mapping_it->add_edit() = aug_edit; - } - aug_mapping_edits.clear(); - // advance to next input mapping - ++aug_mapping_it; - aug_mapping_len = aug_mapping_it != aug_mappings.end() ? 
mapping_from_length(*aug_mapping_it) : 0; - aug_mapping_used = 0; - } - } - } - - assert(total_aug_to_length == mapping_to_length(base_mapping)); - assert(total_aug_from_length == mapping_from_length(base_mapping)); -} - -void PileupAugmenter::map_paths() { - - // We don't remove any nodes, so paths always stay connected - function<void(const Path&)> lambda = [&](const Path& path) { - list<mapping_t>& call_path = _augmented_graph.graph.paths.create_path(path.name()); - map_path(path, call_path, false); - }; - _graph->paths.for_each(lambda); - - // make sure paths are saved - _augmented_graph.graph.paths.rebuild_node_mapping(); - _augmented_graph.graph.paths.rebuild_mapping_aux(); - _augmented_graph.graph.paths.to_graph(_augmented_graph.graph.graph); -} - -void PileupAugmenter::verify_path(const Path& in_path, const list<mapping_t>& call_path) { - function<string(VG*, const mapping_t&)> lambda = [](VG* graph, const mapping_t& mapping) { - const Node* node = graph->get_node(mapping.node_id()); - return mapping_sequence(mapping.to_mapping(), *node); - }; - - string in_string; - for (int i = 0; i < in_path.mapping_size(); ++i) { - in_string += lambda(_graph, in_path.mapping(i)); - } - string call_string; - for (auto& m : call_path) { - call_string += lambda(&_augmented_graph.graph, m); - } - - assert(in_string == call_string); - -} - -void PileupAugmenter::create_augmented_edge(Node* node1, int from_offset, bool left_side1, bool aug1, - Node* node2, int to_offset, bool left_side2, bool aug2, char cat, - StrandSupport support) { - NodeDivider::Entry call_sides1; - NodeDivider::Entry call_sides2; - - if (aug1) { - call_sides1 = _node_divider.break_end(node1, &_augmented_graph.graph, from_offset, - left_side1); - } else { - call_sides1 = NodeDivider::Entry(node1, vector<StrandSupport>(1, support)); - } - if (aug2) { - call_sides2 = _node_divider.break_end(node2, &_augmented_graph.graph, to_offset, - left_side2); - } else { - call_sides2 = NodeDivider::Entry(node2, vector<StrandSupport>(1, support)); - } - - // make up to 9 edges connecting them in the call graph - for (int i = 0; i < (int)NodeDivider::EntryCat::Last; ++i) { - for (int j = 0; j < (int)NodeDivider::EntryCat::Last; ++j) { - if (call_sides1[i] != NULL && call_sides2[j] != NULL) { - // always make links between alts and reference - // (be more strict on deletion edges, only linking two reference) - bool link_edge = ((i == (int)NodeDivider::EntryCat::Ref && - j == (int)NodeDivider::EntryCat::Ref) || - ((i == (int)NodeDivider::EntryCat::Ref || - j == (int)NodeDivider::EntryCat::Ref) && - cat != 'L')); - - NodeSide side1(call_sides1[i]->id(), !left_side1); - NodeSide side2(call_sides2[j]->id(), !left_side2); - if (!_augmented_graph.graph.has_edge(side1, side2)) { - StrandSupport edge_support = support >= StrandSupport() ? support : - min(avgSup(call_sides1.sup(i)), avgSup(call_sides2.sup(j))); - - NodeOffSide no1(NodeSide(node1->id(), !left_side1), from_offset); - NodeOffSide no2(NodeSide(node2->id(), !left_side2), to_offset); - // take augmented deletion edge support from the pileup - if (cat == 'L') { - edge_support = _deletion_supports[minmax(no1, no2)]; - } - // hack to decrease support for an edge that spans an insertion, by subtracting - // that insertion's copy number.
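The edit-splitting bookkeeping in apply_mapping_edits above (be_covered / aug_mapping_used) is the subtlest part of this projection. A minimal standalone sketch of the core arithmetic, using a hypothetical MiniEdit struct in place of vg's protobuf Edit and covering match edits only (the real code additionally carries indels through unchanged and defers insertion handling, as the hacks above show):

```cpp
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Hypothetical stand-in for vg's protobuf Edit (match edits only).
struct MiniEdit { int from_length = 0; int to_length = 0; std::string sequence; };

// Cut one mapping's match edits into pieces that respect fragment boundaries,
// preserving total from_length -- the same bookkeeping apply_mapping_edits
// performs when it redistributes a base mapping's edits.
std::vector<std::vector<MiniEdit>> split_matches(const std::vector<MiniEdit>& edits,
                                                 const std::vector<int>& frag_lens) {
    std::vector<std::vector<MiniEdit>> out(frag_lens.size());
    size_t frag = 0; // which fragment we are filling
    int used = 0;    // from_length already consumed in that fragment
    for (const MiniEdit& e : edits) {
        int covered = 0; // from_length of e already emitted
        while (covered < e.from_length) {
            assert(frag < frag_lens.size()); // total lengths must agree
            int take = std::min(e.from_length - covered, frag_lens[frag] - used);
            MiniEdit piece;
            piece.from_length = piece.to_length = take;
            if (!e.sequence.empty()) {
                piece.sequence = e.sequence.substr(covered, take);
            }
            out[frag].push_back(piece);
            covered += take;
            used += take;
            if (used == frag_lens[frag]) { ++frag; used = 0; } // fragment full
        }
    }
    return out;
}
```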
- auto is_it = _insertion_supports.find(minmax(no1, no2)); - if (is_it != _insertion_supports.end()) { - edge_support = edge_support - is_it->second; - } - - if (link_edge || edge_support.total() >= _min_aug_support) { - - Edge* edge = _augmented_graph.graph.create_edge(call_sides1[i], call_sides2[j], - left_side1, !left_side2); - - // TODO: can edges be annotated more than once with - // different cats? if so, last one will prevail. should - // check if this can impact vcf converter... - annotate_augmented_edge(edge, cat, edge_support); - } - } - } - } -} - -void PileupAugmenter::call_base_pileup(const NodePileup& np, int64_t offset, bool insertion) { - const BasePileup& bp = np.base_pileup(offset); - - // parse the pileup structure - vector<pair<int64_t, int64_t> > base_offsets; - Pileups::parse_base_offsets(bp, base_offsets); - - // compute top two most frequent bases and their counts - string top_base; - int top_count; - int top_rev_count; - string second_base; - int second_count; - int second_rev_count; - int total_count; - compute_top_frequencies(bp, base_offsets, top_base, top_count, top_rev_count, - second_base, second_count, second_rev_count, total_count, - insertion); - - // note first and second base will be upper case too - string ref_base = string(1, ::toupper(bp.ref_base())); - - // get references to node-level members we want to update - Genotype& base_call = insertion ? _insert_calls[offset] : _node_calls[offset]; - pair<StrandSupport, StrandSupport>& support = insertion ? _insert_supports[offset] : _node_supports[offset]; - - if (top_count >= _min_aug_support || (top_base == ref_base && top_count > 0)) { - base_call.first = top_base != ref_base ? top_base : "."; - support.first.fs = top_count - top_rev_count; - support.first.rs = top_rev_count; - support.first.qual = total_base_quality(bp, base_offsets, top_base); - } - if (second_count >= _min_aug_support || (second_base == ref_base && second_count > 0)) { - base_call.second = second_base != ref_base ? second_base : "."; - support.second.fs = second_count - second_rev_count; - support.second.rs = second_rev_count; - support.second.qual = total_base_quality(bp, base_offsets, second_base); - } -} - -void PileupAugmenter::compute_top_frequencies(const BasePileup& bp, - const vector<pair<int64_t, int64_t> >& base_offsets, - string& top_base, int& top_count, int& top_rev_count, - string& second_base, int& second_count, int& second_rev_count, - int& total_count, bool inserts) { - - // histogram of pileup entries (base, indel) - unordered_map<string, int> hist; - // same thing but just reverse strand (used for strand bias filter) - unordered_map<string, int> rev_hist; - - total_count = 0; - const string& bases = bp.bases(); - string ref_base = string(1, ::toupper(bp.ref_base())); - - // compute histogram from pileup - for (auto i : base_offsets) { - string val = Pileups::extract(bp, i.first); - - if ((inserts && val[0] != '+') || (!inserts && val[0] == '+')) { - // toggle inserts - continue; - } - - // We want to know if this pileup supports an N - if (is_all_n(val)) { - // N is not a real base, so we should never augment with it. - continue; - } - - ++total_count; - - // val will always be uppercase / forward strand.
we check - // the original pileup to see if reversed - bool reverse = bases[i.first] == ',' || - (bases[i.first] == '+' && ::islower(bases[i.first + val.length() - 1])) || - (bases[i.first] != '-' && ::islower(bases[i.first])); - if (bases[i.first] == '-') { - string tok = Pileups::extract(bp, i.first); - bool is_reverse, from_start, to_end; - int64_t from_id, from_offset, to_id, to_offset; - Pileups::parse_delete(tok, is_reverse, from_id, from_offset, from_start, to_id, to_offset, to_end); - reverse = is_reverse; - // reset reverse to forward - if (is_reverse) { - Pileups::make_delete(val, false, from_id, from_offset, from_start, to_id, to_offset, to_end); - } - } - - if (hist.find(val) == hist.end()) { - hist[val] = 0; - rev_hist[val] = 0; - } - ++hist[val]; - - if (reverse) { - ++rev_hist[val]; - } - } - - // tie-breaker heuristic: - // reference > transition > transversion > delete > insert > N - function<int(const string&)> base_priority = [&ref_base](const string& base) { - size_t n = base.length(); - if(n == 0) { - // base == '' -> Uncalled: 0 Points - return 0; - } - else - { - char cbase = base[0]; - - if(n == 1) - { - switch(cbase) { - case '.':// Ref: 6 Points - return 6; - break; - case '-': // Uncalled: 0 Points - return 0; - break; - - // Transition: 5 points. Transversion: 4 points - case 'A': - case 't': - return cbase == 'G' ? 5 : 4; - break; - - case 'C': - case 'g': - return cbase == 'T' ? 5 : 4; - break; - - case 'G': - case 'c': - return cbase == 'A' ? 5 : 4; - break; - - case 'T': - case 'a': - return cbase == 'C' ? 5 : 4; - break; - } - } - - // need to happen in any other case - i.e. also for n > 1 - switch(cbase) { - case '-': - return 3; // Deletion: 3 Points - break; - case '+': - return 2; // Insertion: 2 Points - break; - } - } - return 1; - }; - - // compare two pileup entries, to see which has greater count, use tie breaker logic - // if count is the same - function<bool(const string&, int, const string&, int)> base_greater = [&base_priority] ( - const string& base1, int count1, const string& base2, int count2) { - if (count1 == count2) { - int p1 = base_priority(base1); - int p2 = base_priority(base2); - if (p1 == p2) { - return base1 > base2; - } else { - return p1 > p2; - } - } - return count1 > count2; - }; - - // find highest occurring string - top_base.clear(); - top_count = 0; - for (auto i : hist) { - if (base_greater(i.first, i.second, top_base, top_count)) { - top_base = i.first; - top_count = i.second; - } - } - - // find second highest occurring string - // todo: do it in same pass as above - second_base.clear(); - second_count = 0; - for (auto i : hist) { - if (i.first != top_base && - base_greater(i.first, i.second, second_base, second_count)) { - second_base = i.first; - second_count = i.second; - } - } - assert(top_base == "" || top_base != second_base); - top_rev_count = rev_hist[top_base]; - second_rev_count = rev_hist[second_base]; -} - -double PileupAugmenter::total_base_quality(const BasePileup& bp, - const vector<pair<int64_t, int64_t> >& base_offsets, - const string& val) { - double qual_sum = 0; - - const string& bases = bp.bases(); - const string& quals = bp.qualities(); - - for (int i = 0; i < base_offsets.size(); ++i) { - string base = Pileups::extract(bp, base_offsets[i].first); - - // make sure deletes always compared without is_reverse flag - if (base.length() > 1 && base[0] == '-') { - bool is_reverse, from_start, to_end; - int64_t from_id, from_offset, to_id, to_offset; - Pileups::parse_delete(base, is_reverse, from_id, from_offset, from_start, to_id, to_offset, to_end); - // reset reverse to forward - if
(is_reverse) { - Pileups::make_delete(base, false, from_id, from_offset, from_start, to_id, to_offset, to_end); - } - } - - if (base == val) { - char qual = base_offsets[i].second >= 0 ? quals[base_offsets[i].second] : _default_quality; - qual_sum += (double)qual; - } - } - - return qual_sum; -} - -// please refactor me! -void PileupAugmenter::create_node_calls(const NodePileup& np) { - - int n = _node->sequence().length(); - const string& seq = _node->sequence(); - int cur = 0; - int cat = call_cat(_node_calls[cur]); - - // scan calls, merging contiguous reference calls. only consider - // ref / snp / inserts on first pass. - // scan contiguous chunks of a node with same call - // (note: snps will always be 1-base -- never merged) - for (int next = 1; next <= n; ++next) { - int next_cat = next == n ? -1 : call_cat(_node_calls[next]); - - // for anything but case where we merge consec. ref/refs - if (cat == 2 || cat != next_cat || - _insert_calls[next-1].first[0] == '+' || _insert_calls[next-1].second[0] == '+') { - - if (cat == 0 || cat == 1) { - // add reference - vector sup; - if (_node_calls[cur].first == ".") { - for (int i = cur; i < next; ++i) { - sup.push_back(_node_supports[i].first); - } - } - if (_node_calls[cur].second == ".") { - assert (_node_calls[cur].first != "."); - for (int i = cur; i < next; ++i) { - sup.push_back(_node_supports[i].second); - } - } - string new_seq = seq.substr(cur, next - cur); - Node* node = _augmented_graph.graph.create_node(new_seq, ++_max_id); - _node_divider.add_fragment(_node, cur, node, NodeDivider::EntryCat::Ref, sup); - // bridge to node - NodeOffSide no1(NodeSide(_node->id(), true), cur-1); - NodeOffSide no2(NodeSide(_node->id(), false), cur); - _augmented_edges[make_pair(no1, no2)] = 'R'; - // bridge from node - no1 = NodeOffSide(NodeSide(_node->id(), true), next-1); - no2 = NodeOffSide(NodeSide(_node->id(), false), next); - _augmented_edges[make_pair(no1, no2)] = 'R'; - } - else { - // some mix of reference and alts - assert(next == cur + 1); - - function call_het = - [&](string& call1, StrandSupport support1, string& call2, NodeDivider::EntryCat altCat) { - - if (call1 == "." || (altCat == NodeDivider::EntryCat::Alt1 && call2 != ".")) { - // reference base - StrandSupport sup = call1 == "." ? support1 : StrandSupport(); - assert(call2 != "."); // should be handled above - string new_seq = seq.substr(cur, 1); - Node* node = _augmented_graph.graph.create_node(new_seq, ++_max_id); - _node_divider.add_fragment(_node, cur, node, NodeDivider::EntryCat::Ref, - vector(1, sup)); - // bridge to node - NodeOffSide no1(NodeSide(_node->id(), true), cur-1); - NodeOffSide no2(NodeSide(_node->id(), false), cur); - _augmented_edges[make_pair(no1, no2)] = 'R'; - // bridge from node - no1 = NodeOffSide(NodeSide(_node->id(), true), next-1); - no2 = NodeOffSide(NodeSide(_node->id(), false), next); - _augmented_edges[make_pair(no1, no2)] = 'R'; - } - if (call1 != "." 
&& call1[0] != '-' && call1[0] != '+' && ( - // we only want to process a homozygous snp once: - call1 != call2 || altCat == NodeDivider::EntryCat::Alt1)) { - StrandSupport sup = support1; - // snp base - string new_seq = call1; - Node* node = _augmented_graph.graph.create_node(new_seq, ++_max_id); - _node_divider.add_fragment(_node, cur, node, altCat, - vector(1, sup)); - // bridge to node - NodeOffSide no1(NodeSide(_node->id(), true), cur-1); - NodeOffSide no2(NodeSide(_node->id(), false), cur); - _augmented_edges[make_pair(no1, no2)] = 'S'; - // bridge from node - no1 = NodeOffSide(NodeSide(_node->id(), true), next-1); - no2 = NodeOffSide(NodeSide(_node->id(), false), next); - _augmented_edges[make_pair(no1, no2)] = 'S'; - } - else if (call1 != "." && call1[0] == '-' && call1.length() > 1 && ( - // we only want to process homozygous delete once - call1 != call2 || altCat == NodeDivider::EntryCat::Alt1)) { - // delete - int64_t del_len; - bool from_start; - int64_t from_id; - int64_t from_offset; - int64_t to_id; - int64_t to_offset; - bool to_end; - bool reverse; - Pileups::parse_delete(call1, reverse, from_id, from_offset, from_start, to_id, to_offset, to_end); - NodeOffSide s1(NodeSide(from_id, !from_start), from_offset); - NodeOffSide s2(NodeSide(to_id, to_end), to_offset); - Node* node1 = _graph->get_node(from_id); - assert(from_offset >=0 && from_offset < node1->sequence().length()); - Node* node2 = _graph->get_node(to_id); - assert(to_offset >=0 && to_offset < node2->sequence().length()); - - // we're just going to update the divider here, since all - // edges get done at the end - _augmented_edges[make_pair(s1, s2)] = 'L'; - // keep track of its support - _deletion_supports[minmax(s1, s2)] = support1; - - // also need to bridge any fragments created above - if ((from_start && from_offset > 0) || - (!from_start && from_offset < node1->sequence().length() - 1)) { - NodeOffSide no1(NodeSide(from_id, !from_start), from_offset); - NodeOffSide no2(NodeSide(from_id, from_start), - (from_start ? from_offset - 1 : from_offset + 1)); - if (_augmented_edges.find(make_pair(no1, no2)) == _augmented_edges.end()) { - _augmented_edges[make_pair(no1, no2)] = 'R'; - } - } - if ((!to_end && to_offset > 0) || - (to_end && to_offset < node2->sequence().length() - 1)) { - NodeOffSide no1(NodeSide(to_id, to_end), to_offset); - NodeOffSide no2(NodeSide(to_id, !to_end), !to_end ? to_offset - 1 : to_offset + 1); - if (_augmented_edges.find(make_pair(no1, no2)) == _augmented_edges.end()) { - _augmented_edges[make_pair(no1, no2)] = 'R'; - } - - } - } - }; - - // apply same logic to both calls, updating opposite arrays - call_het(_node_calls[cur].first, _node_supports[cur].first, - _node_calls[cur].second, NodeDivider::EntryCat::Alt1); - call_het(_node_calls[cur].second, _node_supports[cur].second, - _node_calls[cur].first, NodeDivider::EntryCat::Alt2); - } - - // inserts done separate at end since they take start between cur and next - function call_inserts = - [&](string& ins_call1, StrandSupport ins_support1, string& ins_call2, StrandSupport ins_support2, - NodeDivider::EntryCat altCat) { - if (ins_call1[0] == '+' && ( - // we only want to process homozygous insert once - ins_call1 != ins_call2 || altCat == NodeDivider::EntryCat::Alt1)) { - int64_t ins_len; - string ins_seq; - bool ins_rev; - Pileups::parse_insert(ins_call1, ins_len, ins_seq, ins_rev); - // todo: check reverse? 
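Zooming out from the per-category lambdas, the enclosing loop of create_node_calls is a run-length scan over per-base calls: flush a chunk whenever the call category changes (the real loop also breaks a run when an insert call interrupts it). The control flow, reduced to a hypothetical standalone form:

```cpp
#include <functional>
#include <vector>

// Emit maximal runs of equal per-base categories as (start, end, category),
// end exclusive -- the same chunking create_node_calls performs before it
// builds a reference node for each run.
void scan_runs(const std::vector<int>& cats,
               const std::function<void(int, int, int)>& emit) {
    int n = (int)cats.size();
    if (n == 0) return;
    int cur = 0;
    for (int next = 1; next <= n; ++next) {
        // category -1 is a sentinel for "past the end", forcing a final flush
        int next_cat = next == n ? -1 : cats[next];
        if (next_cat != cats[cur]) {
            emit(cur, next, cats[cur]); // flush [cur, next)
            cur = next;
        }
    }
}
```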
- Node* node = _augmented_graph.graph.create_node(ins_seq, ++_max_id); - StrandSupport sup = ins_support1; - InsertionRecord ins_rec = {node, sup, _node->id(), next-1}; - _inserted_nodes[node->id()] = ins_rec; - - // bridge to insert - NodeOffSide no1(NodeSide(_node->id(), true), next-1); - NodeOffSide no2(NodeSide(node->id(), false), 0); - _augmented_edges[make_pair(no1, no2)] = 'I'; - // bridge from insert - if (next < _node->sequence().length()) { - NodeOffSide no3 = NodeOffSide(NodeSide(node->id(), true), node->sequence().length() - 1); - NodeOffSide no4 = NodeOffSide(NodeSide(_node->id(), false), next); - _augmented_edges[make_pair(no3, no4)] = 'I'; - // bridge across insert - _augmented_edges[make_pair(no1, no4)] = 'R'; - // remember support "lost" to insertion so we - // can subtract it from the bridge later on - if (_insertion_supports.count(minmax(no1, no4))) { - _insertion_supports[minmax(no1, no4)] += sup; - } else { - _insertion_supports[minmax(no1, no4)] = sup; - } - } else { - // we have to link all outgoing edges to our insert if - // we're at end of node (unlike snps, the fragment structure doesn't - // handle these cases) - vector> next_nodes = _graph->edges_end(_node->id()); - NodeOffSide no3 = NodeOffSide(NodeSide(node->id(), true), node->sequence().length() - 1); - for (auto nn : next_nodes) { - int64_t offset4 = !nn.second ? 0 : _graph->get_node(nn.first)->sequence().length() - 1; - NodeOffSide no4 = NodeOffSide(NodeSide(nn.first, nn.second), offset4); - _augmented_edges[make_pair(no3, no4)] = 'I'; - // bridge across insert - _augmented_edges[make_pair(no1, no4)] = 'R'; - // remember support "lost" to insertion so we - // can subtract it from the bridge later on - if (_insertion_supports.count(minmax(no1, no4))) { - _insertion_supports[minmax(no1, no4)] += sup; - } else { - _insertion_supports[minmax(no1, no4)] = sup; - } - } - } - } - }; - - call_inserts(_insert_calls[next-1].first, _insert_supports[next-1].first, - _insert_calls[next-1].second, _insert_supports[next-1].second, - NodeDivider::EntryCat::Alt1); - call_inserts(_insert_calls[next-1].second, _insert_supports[next-1].second, - _insert_calls[next-1].first, _insert_supports[next-1].first, - NodeDivider::EntryCat::Alt2); - - // shift right - cur = next; - cat = next_cat; - } - } -} - -void PileupAugmenter::annotate_augmented_node(Node* node, char call, StrandSupport support, int64_t orig_id, int orig_offset) -{ - _augmented_graph.node_supports[node->id()].set_forward(support.fs); - _augmented_graph.node_supports[node->id()].set_reverse(support.rs); - _augmented_graph.node_supports[node->id()].set_quality(support.qual); - - if (orig_id != 0 && call != 'S' && call != 'I') { - // Add translations for preserved parts - Translation trans; - auto* new_mapping = trans.mutable_to()->add_mapping(); - new_mapping->mutable_position()->set_node_id(node->id()); - auto* new_edit = new_mapping->add_edit(); - new_edit->set_from_length(node->sequence().size()); - new_edit->set_to_length(node->sequence().size()); - auto* old_mapping = trans.mutable_from()->add_mapping(); - old_mapping->mutable_position()->set_node_id(orig_id); - old_mapping->mutable_position()->set_offset(orig_offset); - auto* old_edit = old_mapping->add_edit(); - old_edit->set_from_length(node->sequence().size()); - old_edit->set_to_length(node->sequence().size()); - - _augmented_graph.translator.translations.push_back(trans); - } -} - -void PileupAugmenter::annotate_augmented_edge(Edge* edge, char call, StrandSupport support) -{ - edge_t edge_handle = 
_graph->edge_handle(_graph->get_handle(edge->from(), edge->from_start()), - _graph->get_handle(edge->to(), edge->to_end())); - _augmented_graph.edge_supports[edge_handle].set_forward(support.fs); - _augmented_graph.edge_supports[edge_handle].set_reverse(support.rs); - _augmented_graph.edge_supports[edge_handle].set_quality(support.qual); -} - -void PileupAugmenter::annotate_augmented_nodes() -{ - for (auto& i : _node_divider.index) { - int64_t orig_node_id = i.first; - for (auto& j : i.second) { - int64_t orig_node_offset = j.first; - NodeDivider::Entry& entry = j.second; - char call = entry.sup_ref.empty() || maxSup(entry.sup_ref) == StrandSupport() ? 'U' : 'R'; - annotate_augmented_node(entry.ref, call, maxSup(entry.sup_ref), orig_node_id, orig_node_offset); - if (entry.alt1 != NULL) { - annotate_augmented_node(entry.alt1, 'S', maxSup(entry.sup_alt1), orig_node_id, orig_node_offset); - } - if (entry.alt2 != NULL) { - annotate_augmented_node(entry.alt2, 'S', maxSup(entry.sup_alt2), orig_node_id, orig_node_offset); - } - } - } -} - -void PileupAugmenter::annotate_non_augmented_nodes() { - _graph->for_each_node([&](Node* node) { - if (!_node_divider.index.count(node->id())) { - Translation trans; - auto* new_mapping = trans.mutable_to()->add_mapping(); - new_mapping->mutable_position()->set_node_id(node->id()); - auto* new_edit = new_mapping->add_edit(); - new_edit->set_from_length(node->sequence().size()); - new_edit->set_to_length(node->sequence().size()); - auto* old_mapping = trans.mutable_from()->add_mapping(); - old_mapping->mutable_position()->set_node_id(node->id()); - old_mapping->mutable_position()->set_offset(0); - auto* old_edit = old_mapping->add_edit(); - old_edit->set_from_length(node->sequence().size()); - old_edit->set_to_length(node->sequence().size()); - _augmented_graph.translator.translations.push_back(trans); - } - }); -} - -void NodeDivider::add_fragment(const Node* orig_node, int offset, Node* fragment, - EntryCat cat, vector<StrandSupport> sup) { - - NodeHash::iterator i = index.find(orig_node->id()); - if (i == index.end()) { - i = index.insert(make_pair(orig_node->id(), NodeMap())).first; - } - - NodeMap& node_map = i->second; - NodeMap::iterator j = node_map.find(offset); - - if (j != node_map.end()) { - assert(j->second[cat] == NULL); - j->second[cat] = fragment; - j->second.sup(cat) = sup; - } else { - Entry ins_triple; - ins_triple[cat] = fragment; - ins_triple.sup(cat) = sup; - j = node_map.insert(make_pair(offset, ins_triple)).first; - } - // sanity checks to make sure we don't introduce an overlap - if (offset == 0) { - assert(j == node_map.begin()); - } - if (offset + fragment->sequence().length() == orig_node->sequence().length()) { - assert(j == --node_map.end()); - } else if (j != --node_map.end()) { - NodeMap::iterator next = j; - ++next; - assert(offset + fragment->sequence().length() <= next->first); - } -} - -NodeDivider::Entry NodeDivider::break_end(const Node* orig_node, VG* graph, int offset, bool left_side) { - NodeHash::iterator i = index.find(orig_node->id()); - if (i == index.end()) { - return Entry(); - } - NodeMap& node_map = i->second; - NodeMap::iterator j = node_map.upper_bound(offset); - if (j == node_map.begin()) { - return Entry(); - } - - --j; - int sub_offset = j->first; - - function<pair<Node*, vector<StrandSupport>>(Node*, EntryCat, vector<StrandSupport>&)> lambda = - [&](Node* fragment, EntryCat cat, vector<StrandSupport>& sup) { - if (offset < sub_offset || offset >= sub_offset + fragment->sequence().length()) { - return make_pair((Node*)NULL, vector<StrandSupport>()); - } - - // if our cut point is already the exact
left or right side of the node, then - // we have nothing to do other than return it. - if (offset == sub_offset && left_side == true) { - return make_pair(fragment, sup); - } - if (offset == sub_offset + fragment->sequence().length() - 1 && left_side == false) { - return make_pair(fragment, sup); - } - - // otherwise, we're somewhere in the middle, and have to subdivide the node - // first, shorten the existing node - int new_len = left_side ? offset - sub_offset : offset - sub_offset + 1; - assert(new_len > 0 && new_len != fragment->sequence().length()); - string frag_seq = fragment->sequence(); - *fragment->mutable_sequence() = frag_seq.substr(0, new_len); - - // then make a new node for the right part - Node* new_node = graph->create_node(frag_seq.substr(new_len, frag_seq.length() - new_len), ++(*_max_id)); - - // now divide up the support, starting with the right bit - vector<StrandSupport> new_sup; - if (!sup.empty()) { - new_sup = vector<StrandSupport>(sup.begin() + new_len, sup.end()); - // then cut the input (left bit) in place - sup.resize(new_len); - } - - // update the data structure with the new node - add_fragment(orig_node, sub_offset + new_len, new_node, cat, new_sup); - - return make_pair(new_node, new_sup); - }; - - vector<StrandSupport>& sup_ref = j->second.sup_ref; - Node* fragment_ref = j->second.ref; - auto new_node_info = fragment_ref != NULL ? lambda(fragment_ref, Ref, sup_ref) : - make_pair((Node*)NULL, vector<StrandSupport>()); - - vector<StrandSupport>& sup_alt1 = j->second.sup_alt1; - Node* fragment_alt1 = j->second.alt1; - auto new_node_alt1_info = fragment_alt1 != NULL ? lambda(fragment_alt1, Alt1, sup_alt1) : - make_pair((Node*)NULL, vector<StrandSupport>()); - - vector<StrandSupport>& sup_alt2 = j->second.sup_alt2; - Node* fragment_alt2 = j->second.alt2; - auto new_node_alt2_info = fragment_alt2 != NULL ? lambda(fragment_alt2, Alt2, sup_alt2) : - make_pair((Node*)NULL, vector<StrandSupport>()); - - Entry ret = left_side ? Entry(new_node_info.first, new_node_info.second, - new_node_alt1_info.first, new_node_alt1_info.second, - new_node_alt2_info.first, new_node_alt2_info.second) : - Entry(fragment_ref, sup_ref, fragment_alt1, sup_alt1, fragment_alt2, sup_alt2); - - return ret; -} - -// this function only works if node is completely covered in divider structure -list<Mapping> NodeDivider::map_node(int64_t node_id, int64_t start_offset, int64_t length, bool reverse){ - NodeHash::iterator i = index.find(node_id); - assert(i != index.end()); - NodeMap& node_map = i->second; - assert(!node_map.empty()); - list<Mapping> out_mappings; - int cur_len = 0; - if (!reverse) { - for (auto i : node_map) { - Node* call_node = i.second.ref; - if (i.first + call_node->sequence().length() > start_offset && cur_len < length) { - assert(call_node != NULL); - Mapping mapping; - mapping.mutable_position()->set_node_id(call_node->id()); - if (start_offset > i.first && out_mappings.empty()) { - mapping.mutable_position()->set_offset(start_offset - i.first); - } else { - mapping.mutable_position()->set_offset(0); - } - int map_len = call_node->sequence().length() - mapping.position().offset(); - if (map_len + cur_len > length) { - map_len = length - cur_len; - } - assert(map_len > 0); - Edit* edit = mapping.add_edit(); - edit->set_from_length(map_len); - edit->set_to_length(map_len); - cur_len += map_len; - out_mappings.push_back(mapping); - } - } - } else { - // should fold into above when on less cold meds.
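The subdivision step inside break_end above reduces to cutting a (sequence, per-base support) pair at one computed length. A minimal sketch, with a hypothetical Fragment type standing in for a Node plus its support vector:

```cpp
#include <cassert>
#include <string>
#include <vector>

// Hypothetical stand-in for a node fragment plus its per-base support.
struct Fragment {
    std::string seq;
    std::vector<int> support; // one entry per base
};

// Shorten frag to new_len bases in place and return the right-hand remainder,
// mirroring how break_end trims the existing node and creates a new one.
Fragment split_fragment(Fragment& frag, size_t new_len) {
    assert(new_len > 0 && new_len < frag.seq.size()); // must cut strictly inside
    Fragment right;
    right.seq = frag.seq.substr(new_len);
    frag.seq.resize(new_len);
    if (!frag.support.empty()) {
        right.support.assign(frag.support.begin() + new_len, frag.support.end());
        frag.support.resize(new_len); // cut the left bit in place
    }
    return right;
}
```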
- for (NodeMap::reverse_iterator i = node_map.rbegin(); i != node_map.rend(); ++i) - { - Node* call_node = i->second.ref; - if (i->first <= start_offset && cur_len < length) { - Mapping mapping; - mapping.mutable_position()->set_is_reverse(true); - mapping.mutable_position()->set_node_id(call_node->id()); - if (start_offset >= i->first && start_offset < i->first + call_node->sequence().length() - 1) { - assert(out_mappings.empty()); - mapping.mutable_position()->set_offset(start_offset - i->first); - } else { - mapping.mutable_position()->set_offset(call_node->sequence().length() - 1); - } - int map_len = mapping.position().offset() + 1; - if (map_len + cur_len > length) { - map_len = length - cur_len; - } - // switch up to new-style offset (todo: revise whole function) - mapping.mutable_position()->set_offset(call_node->sequence().length() - 1 - - mapping.position().offset()); - assert(map_len <= call_node->sequence().length()); - assert(mapping.position().offset() >= 0 && - mapping.position().offset() < call_node->sequence().length()); - Edit* edit = mapping.add_edit(); - edit->set_from_length(map_len); - edit->set_to_length(map_len); - cur_len += map_len; - out_mappings.push_back(mapping); - } - } - } - - assert(cur_len == length); - return out_mappings; -} - -void NodeDivider::clear() { - index.clear(); -} - -ostream& operator<<(ostream& os, const NodeDivider::NodeMap& nm) { - for (auto& x : nm) { - os << x.first << "[" << (x.second.ref ? x.second.ref->id() : -1) << ", " - << (x.second.alt1 ? x.second.alt1->id() : -1) << ", " - << (x.second.alt2 ? x.second.alt2->id() : -1) << "]" << endl; - } - return os; -} - -ostream& operator<<(ostream& os, NodeDivider::Entry entry) { - for (int i = 0; i < NodeDivider::EntryCat::Last; ++i) { - if (entry[i] != NULL) { - os << pb2json(*entry[i]); - } - else { - os << "NULL"; - } - os << ", "; - } - - return os; -} - -ostream& operator<<(ostream& os, const PileupAugmenter::NodeOffSide& no) { - os << "NOS(" << no.first.node << ":" << no.second << ",left=" << !no.first.is_end << ")"; - return os; -} - -} diff --git a/src/pileup_augmenter.hpp b/src/pileup_augmenter.hpp deleted file mode 100644 index a03f10592db..00000000000 --- a/src/pileup_augmenter.hpp +++ /dev/null @@ -1,326 +0,0 @@ -// Augments a graph using a pileup (made with vg pileup) - -#ifndef VG_CALLER_HPP_INCLUDED -#define VG_CALLER_HPP_INCLUDED - -#include -#include -#include -#include -#include -#include -#include -#include -#include "vg.hpp" -#include "hash_map.hpp" -#include "utility.hpp" -#include "pileup.hpp" -#include "path_index.hpp" -#include "genotypekit.hpp" -#include "option.hpp" - -namespace vg { - -using namespace std; - -// container for storing pairs of support for calls (value for each strand) -struct StrandSupport { - int fs; // forward support - int rs; // reverse support - double qual; // phred score (derived from sum log p-err over all observations) - StrandSupport(int f = 0, int r = 0, double q = 0) : - fs(f), rs(r), qual(q) {} - bool operator<(const StrandSupport& other) const { - if ((fs + rs) == (other.fs + other.rs)) { - // more strand bias taken as less support - return abs(fs - rs) > abs(other.fs - rs); - } - return fs + rs < other.fs + other.rs; - } - bool operator>=(const StrandSupport& other) const { - return !(*this < other); - } - bool operator==(const StrandSupport& other) const { - return fs == other.fs && rs == other.rs && qual == other.qual; - } - // min out at 0 - StrandSupport operator-(const StrandSupport& other) const { - return StrandSupport(max(0, fs 
- other.fs), max(0, rs - other.rs), - max(0., qual - other.qual)); - } - StrandSupport& operator+=(const StrandSupport& other) { - fs += other.fs; - rs += other.rs; - qual += other.qual; - return *this; - } - int total() { return fs + rs; } -}; - -inline StrandSupport minSup(vector<StrandSupport>& s) { - if (s.empty()) { - return StrandSupport(); - } - return *min_element(s.begin(), s.end()); -} -inline StrandSupport maxSup(vector<StrandSupport>& s) { - if (s.empty()) { - return StrandSupport(); - } - return *max_element(s.begin(), s.end()); -} -inline StrandSupport avgSup(vector<StrandSupport>& s) { - StrandSupport ret; - if (!s.empty()) { - for (auto sup : s) { - ret.fs += sup.fs; - ret.rs += sup.rs; - ret.qual += sup.qual; - } - ret.fs /= s.size(); - ret.rs /= s.size(); - ret.qual /= s.size(); - } - return ret; -} -inline StrandSupport totalSup(vector<StrandSupport>& s) { - StrandSupport ret; - if (!s.empty()) { - for (auto sup : s) { - ret.fs += sup.fs; - ret.rs += sup.rs; - ret.qual += sup.qual; - } - } - return ret; -} - -inline ostream& operator<<(ostream& os, const StrandSupport& sup) { - return os << sup.fs << ", " << sup.rs << ", " << sup.qual; -} - -// We need to break apart nodes but remember where they came from to update edges. -// Wrap all this up in this class. For a position in the input graph, we can have -// up to three nodes in the augmented graph (Ref, Alt1, Alt2), so we map to node -// triplets (Entry struct below). Note we never *call* all three nodes due to -// diploid assumption, but the augmented graph stores everything. -struct NodeDivider { - // up to three fragments per position in augmented graph (basically a Node 3-tuple, - // avoiding awful C++ tuple syntax) - enum EntryCat {Ref = 0, Alt1, Alt2, Last}; - struct Entry { Entry(Node* r = 0, vector<StrandSupport> sup_r = vector<StrandSupport>(), - Node* a1 = 0, vector<StrandSupport> sup_a1 = vector<StrandSupport>(), - Node* a2 = 0, vector<StrandSupport> sup_a2 = vector<StrandSupport>()) : ref(r), alt1(a1), alt2(a2), - sup_ref(sup_r), sup_alt1(sup_a1), sup_alt2(sup_a2){} - Node* ref; Node* alt1; Node* alt2; - vector<StrandSupport> sup_ref; - vector<StrandSupport> sup_alt1; - vector<StrandSupport> sup_alt2; - Node*& operator[](int i) { - assert(i >= 0 && i <= 2); - return i == EntryCat::Ref ? ref : (i == EntryCat::Alt1 ? alt1 : alt2); - } - vector<StrandSupport>& sup(int i) { - assert(i >= 0 && i <= 2); - return i == EntryCat::Ref ? sup_ref : (i == EntryCat::Alt1 ? sup_alt1 : sup_alt2); - } - }; - // offset in original graph node -> up to 3 nodes in call graph - typedef map<int, Entry> NodeMap; - // Node id in original graph to map above - typedef hash_map<int64_t, NodeMap> NodeHash; - NodeHash index; - int64_t* _max_id; - // map given node to offset i of node with id in original graph - // this function can never handle overlaps (and should only be called before break_end) - void add_fragment(const Node* orig_node, int offset, Node* subnode, EntryCat cat, vector<StrandSupport> sup); - // break node if necessary so that we can attach edge at specified side - // this function will return NULL if there's no node covering the given location - Entry break_end(const Node* orig_node, VG* graph, int offset, bool left_side); - // assuming input node is fully covered, list of nodes that correspond to it in call graph - // if node not in structure at all, just return input (assumption uncalled nodes kept as is) - list<Mapping> map_node(int64_t node_id, int64_t start_offset, int64_t length, bool reverse); - // erase everything (but don't free any Node pointers, they belong to the graph) - void clear(); -}; -ostream& operator<<(ostream& os, const NodeDivider::NodeMap& nm); -ostream& operator<<(ostream& os, NodeDivider::Entry entry); - -/** - * Super simple graph augmenter/caller.
- * Idea: Independently process Pileup records, using a simple model to make calls that - * take into account read errors with a diploid assumption. Edges and node positions - * are called independently for now. - * Outputs either a sample graph (only called nodes and edges) or augmented graph - * (include uncalled nodes and edges too). - */ -class PileupAugmenter { -public: - - // log of zero - static const double Log_zero; - // use this score when pileup is missing quality - static const char Default_default_quality; - // don't augment graph without minimum support - static const int Default_min_aug_support; - - PileupAugmenter(VG* graph, - int default_quality = Default_default_quality, - int min_aug_support = Default_min_aug_support); - - ~PileupAugmenter(); - void clear(); - - // input graph - VG* _graph; - // Output augmented graph with annotations - SupportAugmentedGraph _augmented_graph; - - // buffer for base calls for each position in the node - // . = reference - // - = missing - typedef pair<string, string> Genotype; - vector<Genotype> _node_calls; - vector<pair<StrandSupport, StrandSupport> > _node_supports; - // separate structure for insertion calls since they - // don't really have reference coordinates (instead happen just to - // right of offset). - vector<Genotype> _insert_calls; - vector<pair<StrandSupport, StrandSupport> > _insert_supports; - // buffer for current node - const Node* _node; - // max id in call_graph - int64_t _max_id; - // link called nodes back to the original graph. needed - // to figure out where edges go - NodeDivider _node_divider; - unordered_set<int64_t> _visited_nodes; - unordered_map<pair<NodeSide, NodeSide>, StrandSupport> _called_edges; // map to support - // deletes don't necessarily need to be incident to node ends, - // so we throw in an offset into the mix. - typedef pair<NodeSide, int> NodeOffSide; - // map a call category to an edge - typedef unordered_map<pair<NodeOffSide, NodeOffSide>, char> EdgeHash; - EdgeHash _augmented_edges; - // keep track of inserted nodes for tsv output - struct InsertionRecord { - Node* node; - StrandSupport sup; - int64_t orig_id; - int orig_offset; - }; - typedef unordered_map<int64_t, InsertionRecord> InsertionHash; - InsertionHash _inserted_nodes; - // hack for better estimating support for edges that go around - // insertions (between the adjacent ref nodes) - typedef unordered_map<pair<NodeOffSide, NodeOffSide>, StrandSupport> EdgeSupHash; - EdgeSupHash _insertion_supports; - - // need to keep track of support for augmented deletions - // todo: generalize augmented edge support - EdgeSupHash _deletion_supports; - - // maximum number of nodes to call before writing out output stream - int _buffer_size; - // if we don't have a mapping quality for a read position, use this - char _default_quality; - // minimum support to augment graph - int _min_aug_support; - - // write the augmented graph - void write_augmented_graph(ostream& out, bool json); - - // call every position in the node pileup - void call_node_pileup(const NodePileup& pileup); - - // call an edge.
remembering it in a table for the whole graph - void call_edge_pileup(const EdgePileup& pileup); - - // fill in edges in the augmented graph (those that are incident to 2 call - // nodes) and add uncalled nodes (optionally) - void update_augmented_graph(); - - // map a path (can have edits, ie from Alignment) from base graph to augmented graph - // aug_path parameter is empty path that will be written to - void map_path(const Path& base_path, list& aug_path, bool expect_edits); - - // Apply edits from base_mapping to corresponding augmented mappings that share same - // from interval, but don't yet have edits (called by map_path); - void apply_mapping_edits(const Mapping& base_mapping, list& aug_mappings); - - // TODO: - // method to normalize mapped paths back onto the augmented graph. ie check each - // non-match edit to see if it can be turned into a match on the augmented graph. - - // map paths from input graph into called (augmented) graph - void map_paths(); - // make sure mapped paths generate same strings as input paths - void verify_path(const Path& in_path, const list& call_path); - - // call position at given base - // if insertion flag set to true, call insertion between base and next base - void call_base_pileup(const NodePileup& np, int64_t offset, bool insertions); - - // Find the top-two bases in a pileup, along with their counts - // Last param toggles whether we consider only inserts or everything else - // (do not compare all at once since inserts do not have reference coordinates) - void compute_top_frequencies(const BasePileup& bp, - const vector >& base_offsets, - string& top_base, int& top_count, int& top_rev_count, - string& second_base, int& second_count, int& second_rev_count, - int& total_count, bool inserts); - - // Sum up the qualities of a given symbol in a pileup - double total_base_quality(const BasePileup& pb, - const vector >& base_offsets, - const string& val); - - // write graph structure corresponding to all the calls for the current - // node. - void create_node_calls(const NodePileup& np); - - void create_augmented_edge(Node* node1, int from_offset, bool left_side1, bool aug1, - Node* node2, int to_offset, bool left_side2, bool aug2, char cat, - StrandSupport support); - - // Annotate nodes and edges in the augmented graph with call info. - void annotate_augmented_node(Node* node, char call, StrandSupport support, int64_t orig_id, int orig_offset); - void annotate_augmented_edge(Edge* edge, char call, StrandSupport support); - void annotate_augmented_nodes(); - - // Add nodes that are passed through as-is (ie not augmented at all) to the translation table - void annotate_non_augmented_nodes(); - - // log function that tries to avoid 0s - static double safe_log(double v) { - return v == 0. ? Log_zero : ::log10(v); - } - - // call missing - static bool missing_call(const Genotype& g) { - return g.first == "-" && g.second == "-"; - } - - // call is reference - static bool ref_call(const Genotype& g) { - return g.first == "." && (g.second == "." 
|| g.second == "-"); - } - - // classify call as 0: missing 1: reference 2: snp - // (holdover from before indels) - static int call_cat(const Genotype&g) { - if (missing_call(g)) { - return 0; - } else if (ref_call(g)) { - return 1; - } - return 2; - } -}; - -ostream& operator<<(ostream& os, const PileupAugmenter::NodeOffSide& no); - - -} - -#endif diff --git a/src/subcommand/augment_main.cpp b/src/subcommand/augment_main.cpp index 18385043264..f7e9fdc1b42 100644 --- a/src/subcommand/augment_main.cpp +++ b/src/subcommand/augment_main.cpp @@ -21,32 +21,26 @@ #include #include "subcommand.hpp" - #include "../option.hpp" - +#include "../xg.hpp" #include "../vg.hpp" -#include "../pileup_augmenter.hpp" - +#include "../augment.hpp" +#include +#include +#include +#include "bdsg/packed_graph.hpp" +#include "bdsg/hash_graph.hpp" +#include "bdsg/odgi.hpp" using namespace std; using namespace vg; using namespace vg::subcommand; -// this used to be pileup_main() -static Pileups* compute_pileups(VG* graph, const string& gam_file_name, int thread_count, int min_quality, - int max_mismatches, int window_size, int max_depth, bool use_mapq, - bool strict_edge_support, bool show_progress); - -// this used to be the first half of call_main() -static void augment_with_pileups(PileupAugmenter& augmenter, Pileups& pileups, bool expect_subgraph, - bool show_progress); - void help_augment(char** argv, ConfigurableParser& parser) { cerr << "usage: " << argv[0] << " augment [options] [alignment.gam] > augmented_graph.vg" << endl << "Embed GAM alignments into a graph to facilitate variant calling" << endl << endl << "general options:" << endl - << " -a, --augmentation-mode M augmentation mode. M = {pileup, direct} [direct]" << endl << " -i, --include-paths merge the paths implied by alignments into the graph" << endl << " -C, --cut-softclips drop softclips from the paths (recommended)" << endl << " -B, --label-paths don't augment with alignments, just use them for labeling the graph" << endl @@ -58,18 +52,7 @@ void help_augment(char** argv, ConfigurableParser& parser) { << " -t, --threads N number of threads to use" << endl << "loci file options:" << endl << " -l, --include-loci FILE merge all alleles in loci into the graph" << endl - << " -L, --include-gt FILE merge only the alleles in called genotypes into the graph" << endl - << "pileup options:" << endl - << " -P, --pileup FILE save pileups to FILE" << endl - << " -S, --support FILE save supports to FILE" << endl - << " -g, --min-aug-support N minimum support to augment graph [" - << PileupAugmenter::Default_min_aug_support << "]" << endl - << " -U, --subgraph expect a subgraph and ignore extra pileup entries outside it" << endl - << " -q, --min-quality N ignore bases with PHRED quality < N (default=0)" << endl - << " -m, --max-mismatches N ignore bases with > N mismatches within window centered on read (default=1)" << endl - << " -w, --window-size N size of window to apply -m option (default=0)" << endl - << " -M, --ignore-mapq do not combine mapping qualities with base qualities in pileup" << endl - << " -r, --recall recall mode: compute supports but don't add structure to the graph" << endl; + << " -L, --include-gt FILE merge only the alleles in called genotypes into the graph" << endl; // Then report more options parser.print_help(cerr); @@ -77,18 +60,6 @@ void help_augment(char** argv, ConfigurableParser& parser) { int main_augment(int argc, char** argv) { - // augmentation mode - string augmentation_mode = "direct"; - - // load pileupes from here - string 
pileup_file_name; - - // minimum support to consider adding a variant to the graph - int min_aug_support = PileupAugmenter::Default_min_aug_support; - - // Should we expect a subgraph and ignore pileups for missing nodes/edges? - bool expect_subgraph = false; - // Write the translations (as protobuf) to this path string translation_file_name; @@ -106,9 +77,6 @@ int main_augment(int argc, char** argv) { // Merge only alleles from called genotypes in the loci file bool called_genotypes_only = false; - - // Write the supports (as protobuf) to this path - string support_file_name; // Load in GAM alignments to map over to the augmented graph from here string gam_in_file_name; @@ -125,31 +93,10 @@ int main_augment(int argc, char** argv) { // Number of threads to use (will default to all if not specified) int thread_count = 0; - // Bases with quality less than 10 will not be added to the pileup - int min_quality = 0; - - // Bases with more than this many mismatches within the window_size not added - int max_mismatches = 1; - - // Window size for above (0 effectively turns this check off) - int window_size = 0; - - // Hack to prevent protobuf messages from getting too big by limiting depth at - // any given position to max_depth - int max_depth = 1000; - - // Combine MAPQ and PHRED base qualities to determine quality at each position - // If false, only PHRED base quality will be used. - bool use_mapq = true; - - // Recall mode: compute supports so vg call can be run, but don't actually augment - // the graph - bool recall_mode = false; - - static const struct option long_options[] = { - // General Options + // Deprecated Options {"augmentation-mode", required_argument, 0, 'a'}, + // General Options {"translation", required_argument, 0, 'Z'}, {"alignment-out", required_argument, 0, 'A'}, {"include-paths", no_argument, 0, 'i'}, @@ -162,19 +109,9 @@ int main_augment(int argc, char** argv) { // Loci Options {"include-loci", required_argument, 0, 'l'}, {"include-gt", required_argument, 0, 'L'}, - // Pileup Options - {"pileup", required_argument, 0, 'P'}, - {"support", required_argument, 0, 'S'}, - {"min-quality", required_argument, 0, 'q'}, - {"max-mismatches", required_argument, 0, 'm'}, - {"window-size", required_argument, 0, 'w'}, - {"ignore-mapq", no_argument, 0, 'M'}, - {"min-aug-support", required_argument, 0, 'g'}, - {"subgraph", no_argument, 0, 'U'}, - {"recall", no_argument, 0, 'r'}, {0, 0, 0, 0} }; - static const char* short_options = "a:Z:A:iBhpvt:l:L:P:S:q:m:w:Mg:UrC"; + static const char* short_options = "a:Z:A:iCBhpvt:l:L:"; optind = 2; // force optind past command positional arguments // This is our command-line parser @@ -182,10 +119,11 @@ int main_augment(int argc, char** argv) { // Parse all the options we have defined here. switch (c) { - // General Options + // Deprecated. 
case 'a': - augmentation_mode = optarg; + cerr << "[vg augment] warning: -a / --augmentation-mode option is deprecated" << endl; break; + // General Options case 'Z': translation_file_name = optarg; break; @@ -226,35 +164,6 @@ int main_augment(int argc, char** argv) { called_genotypes_only = true; break; - // Pileup Options - case 'P': - pileup_file_name = optarg; - break; - case 'S': - support_file_name = optarg; - break; - case 'q': - min_quality = parse(optarg); - break; - case 'm': - max_mismatches = parse(optarg); - break; - case 'w': - window_size = parse(optarg); - break; - case 'M': - use_mapq = false; - break; - case 'g': - min_aug_support = parse(optarg); - break; - case 'U': - expect_subgraph = true; - break; - case 'r': - recall_mode = true; - break; - default: abort (); } @@ -289,375 +198,146 @@ int main_augment(int argc, char** argv) { cerr << "[vg augment] error: graph and gam can't both be from stdin." << endl; return 1; } - if (gam_in_file_name == "-" && !include_paths && !label_paths && augmentation_mode == "direct") { - cerr << "[vg augment] error: cannot stream input gam when not using -i option (as it requires 2 passes)" << endl; - return 1; - } - - if (augmentation_mode != "pileup" && augmentation_mode != "direct") { - cerr << "[vg augment] error: pileup and direct are currently the only supported augmentation modes (-a)" << endl; - return 1; - } - - if (augmentation_mode != "direct" and !gam_out_file_name.empty()) { - cerr << "[vg augment] error: GAM output only works with \"direct\" augmentation mode" << endl; - return 1; - } - - if (augmentation_mode != "pileup" and (!support_file_name.empty() || !pileup_file_name.empty())) { - cerr << "[vg augment] error: Pileup (-P) and Support (-S) output only work with \"pileup\" augmentation mode" << endl; - return 1; - } - if (label_paths && (!gam_out_file_name.empty() || !translation_file_name.empty())) { cerr << "[vg augment] error: Translation (-Z) and GAM (-A) output do not work with \"label-only\" (-B) mode" << endl; return 1; } - + if (gam_in_file_name == "-" && !label_paths) { + cerr << "[vg augment] warning: reading the entire GAM from stdin into memory. it is recommended to pass in" + << " a filename rather than - so it can be streamed over two passes" << endl; + } + // read the graph if (show_progress) { cerr << "Reading input graph" << endl; } - VG* graph; + + // Read the graph + unique_ptr graph; get_input_file(graph_file_name, [&](istream& in) { - graph = new VG(in); - }); - - - Pileups* pileups = nullptr; + graph = vg::io::VPKG::load_one(in); + }); + VG* vg_graph = dynamic_cast(graph.get()); - if (!pileup_file_name.empty() || augmentation_mode == "pileup") { - // We will need the computed pileups - - // compute the pileups from the graph and gam - pileups = compute_pileups(graph, gam_in_file_name, thread_count, min_quality, max_mismatches, - window_size, max_depth, use_mapq, !recall_mode, show_progress); - } - - if (!pileup_file_name.empty()) { - // We want to write out pileups. 
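The new load path above has had its template arguments stripped somewhere in transit (the same artifact runs through this whole copy of the patch). Restored by inference, it presumably reads along the following lines; the exact type parameters are an assumption based on how the graph is used below, not verbatim from the patch:

```cpp
// Sketch with the stripped template arguments restored (inferred, not verbatim).
// Assumes vg's usual headers (vg.hpp, vg/io/vpkg.hpp) are included.
unique_ptr<MutablePathMutableHandleGraph> graph;
get_input_file(graph_file_name, [&](istream& in) {
    graph = vg::io::VPKG::load_one<MutablePathMutableHandleGraph>(in);
});
// Path-rank fixups later on only apply to the old VG class, hence this probe.
VG* vg_graph = dynamic_cast<VG*>(graph.get());
```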
- if (show_progress) { - cerr << "Writing pileups" << endl; - } - ofstream pileup_file(pileup_file_name); - if (!pileup_file) { - cerr << "[vg augment] error: unable to open output pileup file: " << pileup_file_name << endl; - exit(1); + if (label_paths) { + // Just add path names with extend() + get_input_file(gam_in_file_name, [&](istream& alignment_stream) { + vg::io::for_each(alignment_stream, [&](Alignment& alignment) { + if (!include_softclips) { + softclip_trim(alignment); + } + Path simplified_path = simplify(alignment.path()); + *simplified_path.mutable_name() = alignment.name(); + add_path_to_graph(graph.get(), simplified_path); + }); + }); + if (vg_graph != nullptr) { + vg_graph->paths.sort_by_mapping_rank(); + vg_graph->paths.rebuild_mapping_aux(); } - pileups->write(pileup_file); } - - if (augmentation_mode == "direct" && !gam_in_file_name.empty()) { - // Augment with the reads - - if (!support_file_name.empty()) { - cerr << "[vg augment] error: support calculation in direct augmentation mode is unimplemented" << endl; - exit(1); - } - - // We don't need any pileups - if (pileups != nullptr) { - delete pileups; - pileups = nullptr; - } - + else { + // Actually do augmentation vector translation; - - // Just add path names with extend() - if (label_paths) { - get_input_file(gam_in_file_name, [&](istream& alignment_stream) { - vg::io::for_each(alignment_stream, [&](Alignment& alignment) { - if (!include_softclips) { - softclip_trim(alignment); - } - Path simplified_path = simplify(alignment.path()); - *simplified_path.mutable_name() = alignment.name(); - graph->paths.extend(simplified_path, false, false); - }); - }); - graph->paths.sort_by_mapping_rank(); - graph->paths.rebuild_mapping_aux(); - } - // We use the old interface when streaming in paths to include in order to preserve - // backward compatibility (with vg mod - -i aln.gam ), as the new interface can't read from stdin. - else if (include_paths && gam_in_file_name == "-") { - // Load all the reads - vector reads; - // And pull out their paths - vector read_paths; - vg::io::for_each(std::cin, (function)[&](Alignment& aln) { - if (!include_softclips) { - softclip_trim(aln); - } - read_paths.push_back(aln.path()); - reads.push_back(aln); - *reads.back().mutable_path() = Path(); - }); - graph->edit(read_paths, - !translation_file_name.empty() ? &translation : nullptr, - include_paths, - !gam_out_file_name.empty(), - false); - if (!gam_out_file_name.empty()) { - ofstream gam_out_file(gam_out_file_name); - vector gam_buffer; - for (size_t i = 0; i < reads.size(); i++) { - // Say we are going to write out the alignment - gam_buffer.push_back(reads[i]); - // Set its path to the corrected embedded path - *gam_buffer.back().mutable_path() = read_paths[i]; - // Write it back out - vg::io::write_buffered(gam_out_file, gam_buffer, 100); - } - // Flush the buffer - vg::io::write_buffered(gam_out_file, gam_buffer, 0); - } - } - // The new streaming interface doesn't keep the GAM in memory. It does make two passes of it though, - // so it won't work with stdin - else { - assert(gam_in_file_name != "-"); - ifstream gam_in_file(gam_in_file_name); - ofstream gam_out_file; - if (!gam_out_file_name.empty()) { - gam_out_file.open(gam_out_file_name); - } - graph->edit(gam_in_file, - !translation_file_name.empty() ? &translation : nullptr, - include_paths, - !gam_out_file_name.empty() ? 
&gam_out_file : nullptr, - false, !include_softclips); - } - - // Write the augmented graph - if (show_progress) { - cerr << "Writing augmented graph" << endl; - } - graph->serialize_to_ostream(cout); - - if (!translation_file_name.empty()) { - // Write the translations - if (show_progress) { - cerr << "Writing translation table" << endl; - } - ofstream translation_file(translation_file_name); - if (!translation_file) { - cerr << "[vg augment]: Error opening translation file: " << translation_file_name << endl; - return 1; - } - vg::io::write_buffered(translation_file, translation, 0); - translation_file.close(); - } - - } else if (augmentation_mode == "pileup") { - // We want to augment with pileups - - // The PileupAugmenter object will take care of all augmentation - PileupAugmenter augmenter(graph, PileupAugmenter::Default_default_quality, - recall_mode ? numeric_limits::max() : min_aug_support); - - // compute the augmented graph from the pileup - // Note: we can save a fair bit of memory by clearing pileups, and re-reading off of - // pileup_file_name - augment_with_pileups(augmenter, *pileups, expect_subgraph, show_progress); - delete pileups; - pileups = nullptr; - - // write the augmented graph - if (show_progress) { - cerr << "Writing augmented graph" << endl; - } - augmenter.write_augmented_graph(cout, false); - - // write the agumented gam + ofstream gam_out_file; if (!gam_out_file_name.empty()) { - ofstream gam_out_file(gam_out_file_name); + gam_out_file.open(gam_out_file_name); if (!gam_out_file) { - cerr << "[vg augment]: Error opening output GAM file: " << gam_out_file_name << endl; + cerr << "[vg augment] error: could not open output GAM file: " << gam_out_file_name << endl; return 1; } - get_input_file(gam_in_file_name, [&](istream& alignment_stream) { - vector gam_buffer; - function lambda = [&gam_out_file, &gam_buffer, &augmenter](Alignment& alignment) { - list aug_path; - augmenter.map_path(alignment.path(), aug_path, true); - alignment.mutable_path()->clear_mapping(); - for (auto& aug_mapping : aug_path) { - *alignment.mutable_path()->add_mapping() = aug_mapping.to_mapping(); + } + if (gam_in_file_name == "-" || !loci_file.empty()) { + vector buffer; + if (gam_in_file_name == "-") { + // this is usually bad news, but we gave a warning + get_input_file(gam_in_file_name, [&](istream& alignment_stream) { + vg::io::for_each(alignment_stream, [&](Alignment& alignment) { + Path& path = *alignment.mutable_path(); + path.set_name(alignment.name()); + buffer.push_back(path); + }); + }); + } else if (!loci_file.empty()) { + function lambda = [&graph, &buffer, &called_genotypes_only](Locus& locus) { + // if we are only doing called genotypes, record so we can filter alleles + set alleles_in_genotype; + if (called_genotypes_only) { + for (int i = 0; i < locus.genotype_size(); ++i) { + for (int j = 0; j < locus.genotype(i).allele_size(); ++j) { + alleles_in_genotype.insert(locus.genotype(i).allele(j)); + } + } + } + for (int i = 0; i < locus.allele_size(); ++i) { + // skip alleles not in the genotype if using only called genotypes + if (!alleles_in_genotype.empty()) { + if (!alleles_in_genotype.count(i)) continue; + } + Path path = simplify(locus.allele(i)); + stringstream name; + name << locus.name() << ":" << i; + path.set_name(name.str()); + buffer.push_back(path); } - gam_buffer.push_back(alignment); - vg::io::write_buffered(gam_out_file, gam_buffer, 100); }; - vg::io::for_each(alignment_stream, lambda); - vg::io::write_buffered(gam_out_file, gam_buffer, 0); - }); + 
get_input_file(loci_file, [&](istream& loci_stream) { + vg::io::for_each(loci_stream, lambda); + }); + } + + augment(graph.get(), + buffer, + translation_file_name.empty() ? nullptr : &translation, + gam_out_file_name.empty() ? nullptr : &gam_out_file, + include_paths, + include_paths, + !include_softclips); + } else { + // much better to stream from a file so we can do two passes without storing in memory + get_input_file(gam_in_file_name, [&](istream& alignment_stream) { + augment(graph.get(), + alignment_stream, + translation_file_name.empty() ? nullptr : &translation, + gam_out_file_name.empty() ? nullptr : &gam_out_file, + include_paths, + include_paths, + !include_softclips); + }); } - // write the translation + // we don't have a streaming interface for translation: write the buffer now if (!translation_file_name.empty()) { - // write the translations + // Write the translations if (show_progress) { cerr << "Writing translation table" << endl; } ofstream translation_file(translation_file_name); if (!translation_file) { - cerr << "[vg augment] error: error opening translation file: " << translation_file_name << endl; + cerr << "[vg augment]: Error opening translation file: " << translation_file_name << endl; return 1; } - augmenter._augmented_graph.write_translations(translation_file); + vg::io::write_buffered(translation_file, translation, 0); translation_file.close(); } - - // write the supports - if (!support_file_name.empty()) { - // write the supports - if (show_progress) { - cerr << "Writing supports" << endl; - } - ofstream support_file(support_file_name); - if (!support_file) { - cerr << "[vg augment] error: error opening supports file: " << support_file_name << endl; - return 1; - } - augmenter._augmented_graph.write_supports(support_file); - support_file.close(); - } - } else if (!loci_file.empty()) { - // read in the alignments and save their paths - vector<Path> paths; - function<void(Locus&)> lambda = [&graph, &paths, &called_genotypes_only](Locus& locus) { - // if we are only doing called genotypes, record so we can filter alleles - set<int> alleles_in_genotype; - if (called_genotypes_only) { - for (int i = 0; i < locus.genotype_size(); ++i) { - for (int j = 0; j < locus.genotype(i).allele_size(); ++j) { - alleles_in_genotype.insert(locus.genotype(i).allele(j)); - } - } - } - for (int i = 0; i < locus.allele_size(); ++i) { - // skip alleles not in the genotype if using only called genotypes - if (!alleles_in_genotype.empty()) { - if (!alleles_in_genotype.count(i)) continue; - } - Path path = simplify(locus.allele(i)); - stringstream name; - name << locus.name() << ":" << i; - path.set_name(name.str()); - paths.push_back(path); - } - }; - if (loci_file == "-") { - vg::io::for_each(std::cin, lambda); - } else { - ifstream in; - in.open(loci_file.c_str()); - vg::io::for_each(in, lambda); - } - // execute the edits and produce the translation if requested. - // Make sure to break at node ends, but don't add any paths because they're just loci alleles and not real paths. - vector<Translation> translation; - graph->edit(paths, &translation, false, false, true); - if (!translation_file_name.empty()) { - ofstream out(translation_file_name); - vg::io::write_buffered(out, translation, 0); - out.close(); - } - graph->serialize_to_ostream(cout); + } + + // Serialize the graph using VPKG. Todo: is there a way to do this in one line? // could just call serialize() directly if willing to forego vpkg...
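The serializer dispatch just below shows the same template-argument stripping on its dynamic_cast calls. Restored by inference (assuming the bdsg types pulled in at the top of the file), it presumably reads:

```cpp
// Sketch with inferred template arguments: probe each known concrete graph
// type in turn and hand it to VPKG's typed save.
if (vg_graph != nullptr) {
    vg::io::VPKG::save(*vg_graph, cout);
} else if (dynamic_cast<bdsg::PackedGraph*>(graph.get()) != nullptr) {
    vg::io::VPKG::save(*dynamic_cast<bdsg::PackedGraph*>(graph.get()), cout);
} else if (dynamic_cast<bdsg::HashGraph*>(graph.get()) != nullptr) {
    vg::io::VPKG::save(*dynamic_cast<bdsg::HashGraph*>(graph.get()), cout);
} else if (dynamic_cast<bdsg::ODGI*>(graph.get()) != nullptr) {
    vg::io::VPKG::save(*dynamic_cast<bdsg::ODGI*>(graph.get()), cout);
} else {
    throw runtime_error("Internal error: vg augment cannot output this graph format");
}
```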
+    if (vg_graph != nullptr) {
+        vg::io::VPKG::save(*vg_graph, cout);
+    } else if (dynamic_cast<bdsg::HashGraph*>(graph.get()) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::HashGraph*>(graph.get()), cout);
+    } else if (dynamic_cast<bdsg::PackedGraph*>(graph.get()) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::PackedGraph*>(graph.get()), cout);
+    } else if (dynamic_cast<bdsg::ODGI*>(graph.get()) != nullptr) {
+        vg::io::VPKG::save(*dynamic_cast<bdsg::ODGI*>(graph.get()), cout);
+    } else {
+        throw runtime_error("Internal error: vg augment cannot output this graph format");
     }
-
-    if (pileups != nullptr) {
-        delete pileups;
-        pileups = nullptr;
-    }
-    delete graph;
-    return 0;
 }

-Pileups* compute_pileups(VG* graph, const string& gam_file_name, int thread_count, int min_quality,
-                         int max_mismatches, int window_size, int max_depth, bool use_mapq,
-                         bool strict_edge_support, bool show_progress) {
-
-    // Make Pileups makers for each thread.
-    vector<Pileups*> pileups;
-    for (int i = 0; i < thread_count; ++i) {
-        pileups.push_back(new Pileups(graph, min_quality, max_mismatches, window_size, max_depth, use_mapq,
-                                      strict_edge_support));
-    }
-
-    // setup alignment stream
-    get_input_file(gam_file_name, [&](istream& alignment_stream) {
-        // compute the pileups.
-        if (show_progress) {
-            cerr << "Computing pileups" << endl;
-        }
-
-        function<void(Alignment&)> lambda = [&pileups, &graph](Alignment& aln) {
-            int tid = omp_get_thread_num();
-            pileups[tid]->compute_from_alignment(aln);
-        };
-        vg::io::for_each_parallel(alignment_stream, lambda);
-    });
-
-    // single-threaded (!) merge
-    if (show_progress && pileups.size() > 1) {
-        cerr << "Merging pileups" << endl;
-    }
-    for (int i = 1; i < pileups.size(); ++i) {
-        pileups[0]->merge(*pileups[i]);
-        delete pileups[i];
-    }
-    return pileups[0];
-}
-
-void augment_with_pileups(PileupAugmenter& augmenter, Pileups& pileups, bool expect_subgraph,
-                          bool show_progress) {
-
-    if (show_progress) {
-        cerr << "Computing augmented graph from the pileup" << endl;
-    }
-
-    pileups.for_each_node_pileup([&](const NodePileup& node_pileup) {
-        if (!augmenter._graph->has_node(node_pileup.node_id())) {
-            // This pileup doesn't belong in this graph
-            if(!expect_subgraph) {
-                throw runtime_error("Found pileup for nonexistent node " + to_string(node_pileup.node_id()));
-            }
-            // If that's expected, just skip it
-            return;
-        }
-        // Send approved pileups to the augmenter
-        augmenter.call_node_pileup(node_pileup);
-
-    });
-
-    pileups.for_each_edge_pileup([&](const EdgePileup& edge_pileup) {
-        if (!augmenter._graph->has_edge(edge_pileup.edge())) {
-            // This pileup doesn't belong in this graph
-            if(!expect_subgraph) {
-                throw runtime_error("Found pileup for nonexistent edge " + pb2json(edge_pileup.edge()));
-            }
-            // If that's expected, just skip it
-            return;
-        }
-        // Send approved pileups to the augmenter
-        augmenter.call_edge_pileup(edge_pileup);
-    });
-
-    // map the edges from original graph
-    if (show_progress) {
-        cerr << "Mapping edges into augmented graph" << endl;
-    }
-    augmenter.update_augmented_graph();
-
-    // map the paths from the original graph
-    if (show_progress) {
-        cerr << "Mapping paths into augmented graph" << endl;
-    }
-    augmenter.map_paths();
-}
-
 // Register subcommand
 static Subcommand vg_augment("augment", "augment a graph from an alignment", PIPELINE, 5, main_augment);

diff --git a/src/subcommand/call_main_old.cpp b/src/subcommand/call_main_old.cpp
deleted file mode 100644
index 4ddbbbae016..00000000000
--- a/src/subcommand/call_main_old.cpp
+++ /dev/null
@@ -1,257 +0,0 @@
-/** \file call_main.cpp
- *
- * Defines the "vg call" subcommand, which calls variation from an augmented
graph - */ - -#include -#include -#include - -#include -#include - -#include "subcommand.hpp" - -#include "../option.hpp" - -#include "../vg.hpp" -#include "../xg.hpp" -#include "../support_caller.hpp" -#include -#include - - - -using namespace std; -using namespace vg; -using namespace vg::subcommand; - -void help_call_old(char** argv, ConfigurableParser& parser) { - cerr << "usage: " << argv[0] << " call [options] > output.vcf" << endl - << "Output variant calls in VCF or Loci format given a graph and pileup" << endl - << endl - << "genotyper options:" << endl - - << "general options:" << endl - << " -z, --translation FILE input translation table" << endl - << " -b, --base-graph FILE base graph. currently needed for XREF tag" << endl - << " -g, --snarls FILE snarls of input graph (to save re-computing)" << endl - << " -h, --help print this help message" << endl - << " -p, --progress show progress" << endl - << " -v, --verbose print information and warnings about vcf generation" << endl - << " -t, --threads N number of threads to use" << endl; - - // Then report more options - parser.print_help(cerr); -} - -int main_call_old(int argc, char** argv) { - - string translation_file_name; - - // todo: get rid of this. only used to check if deletion edge is novel. must be some - // way to get that out of the translations - string base_graph_file_name; - - string snarl_file_name; - - // This manages conversion from an augmented graph to a VCF, and makes the - // actual calls. - SupportCaller support_caller; - - bool show_progress = false; - int thread_count = 0; - - static const struct option long_options[] = { - {"base-graph", required_argument, 0, 'b'}, - {"translation", required_argument, 0, 'z'}, - {"snarls", required_argument, 0, 'g'}, - {"progress", no_argument, 0, 'p'}, - {"verbose", no_argument, 0, 'v'}, - {"threads", required_argument, 0, 't'}, - {"help", no_argument, 0, 'h'}, - {0, 0, 0, 0} - }; - static const char* short_options = "z:b:pvt:hg:"; - optind = 2; // force optind past command positional arguments - - // This is our command-line parser - ConfigurableParser parser(short_options, long_options, [&](int c) { - // Parse all the options we have defined here. - switch (c) - { - case 'z': - translation_file_name = optarg; - break; - case 'b': - base_graph_file_name = optarg; - break; - case 'g': - snarl_file_name = optarg; - break; - case 'p': - show_progress = true; - break; - case 'v': - support_caller.verbose = true; - break; - case 't': - thread_count = parse(optarg); - break; - case 'h': - case '?': - /* getopt_long already printed an error message. */ - help_call_old(argv, parser); - exit(1); - break; - default: - abort (); - } - }); - // Register the support_caller converter for configuring with its options. - parser.register_configurable(&support_caller); - - if (argc <= 3) { - help_call_old(argv, parser); - return 1; - } - - // Parse the command line options, updating optind. 
- parser.parse(argc, argv); - - if (thread_count != 0) { - // Use a non-default number of threads - omp_set_num_threads(thread_count); - } - thread_count = get_thread_count(); - - // Parse the arguments - if (optind >= argc) { - help_call_old(argv, parser); - return 1; - } - string graph_file_name = get_input_file_name(optind, argc, argv); - - if (string(support_caller.support_file_name).empty() == - string(support_caller.pack_file_name).empty()) { - cerr << "[vg call]: Support file must be specified with either -s (or -P)" << endl; - return 1; - } - - - if (string(support_caller.pack_file_name).empty() && translation_file_name.empty()) { - cerr << "[vg call]: Translation file must be specified with -Z" << endl; - return 1; - } - - // read the vcf and accompanying fasta files if specified - if (!((string)(support_caller.recall_vcf_filename)).empty()) { - if (support_caller.variant_offset != 0) { - cerr << "error: [vg call] vcf offset (-o) not supported in recall mode (-f)" << endl; - return 1; - } - support_caller.variant_file.parseSamples = false; - support_caller.variant_file.open(support_caller.recall_vcf_filename); - if (!support_caller.variant_file.is_open()) { - cerr << "error: [vg call] could not open " << (string)support_caller.recall_vcf_filename << endl; - return 1; - } - - // load up the fasta - if (!((string)support_caller.recall_ref_fasta_filename).empty()) { - support_caller.ref_fasta = unique_ptr(new FastaReference); - support_caller.ref_fasta->open(support_caller.recall_ref_fasta_filename); - } - if (!((string)support_caller.recall_ins_fasta_filename).empty()) { - support_caller.ins_fasta = unique_ptr(new FastaReference); - support_caller.ins_fasta->open(support_caller.recall_ins_fasta_filename); - } - } - - // read the graph - if (show_progress) { - cerr << "Reading input graph" << endl; - } - VG* graph; - get_input_file(graph_file_name, [&](istream& in) { - graph = new VG(in); - }); - - // and the base graph - VG* base_graph = NULL; - if (!base_graph_file_name.empty()) { - get_input_file(base_graph_file_name, [&](istream& in) { - base_graph = new VG(in); - }); - } - - SupportAugmentedGraph augmented_graph; - // Move our input graph into the augmented graph - swap(augmented_graph.graph, *graph); - - augmented_graph.base_graph = base_graph; - - delete graph; - - // Load the supports - if (!string(support_caller.support_file_name).empty()) { - ifstream support_file(support_caller.support_file_name); - if (!support_file) { - cerr << "[vg call]: Unable to load supports file: " - << string(support_caller.support_file_name) << endl; - return 1; - } - augmented_graph.load_supports(support_file); - } else { - assert(!string(support_caller.pack_file_name).empty()); - if (string(support_caller.xg_file_name).empty()) { - cerr << "[vg call]: pack support (-P) requires xg index (-x)" << endl; - return 1; - } - unique_ptr xgidx = vg::io::VPKG::load_one(support_caller.xg_file_name); - augmented_graph.load_pack_as_supports(support_caller.pack_file_name, xgidx.get()); - // make sure we're ignoring quality, as it's not read from the pack - bool& usc = support_caller.use_support_count; - usc = true; - } - - // Load the translations - if (!translation_file_name.empty()) { - ifstream translation_file(translation_file_name.c_str()); - if (!translation_file) { - cerr << "[vg call]: Unable to load translations file: " << translation_file_name << endl; - return 1; - } - augmented_graph.load_translations(translation_file); - } - - // Load or compute the snarls - unique_ptr snarl_manager; - if 
(!snarl_file_name.empty()) { - ifstream snarl_file(snarl_file_name.c_str()); - if (!snarl_file) { - cerr << "[vg call]: Unable to load snarls file: " << snarl_file_name << endl; - return 1; - } - snarl_manager = vg::io::VPKG::load_one(snarl_file); - } else { - CactusSnarlFinder finder(augmented_graph.graph); - snarl_manager = unique_ptr(new SnarlManager(std::move(finder.find_snarls()))); - } - - - if (show_progress) { - cerr << "Calling variants with support caller" << endl; - } - - // project the augmented graph to a reference path - // in order to create a VCF of calls. - support_caller.call(augmented_graph, *snarl_manager.get(), {}); - - delete base_graph; - return 0; -} - -// Register subcommand -static Subcommand vg_call_old("call-old", "call variants on an augmented graph [deprecated, soon to be removed]", DEVELOPMENT, main_call_old); - diff --git a/src/support_caller.cpp b/src/support_caller.cpp deleted file mode 100644 index 2106e872c68..00000000000 --- a/src/support_caller.cpp +++ /dev/null @@ -1,2593 +0,0 @@ -// Call variants using an augmented graphs with annotated supports - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "vg.hpp" -#include "index.hpp" -#include "Variant.h" -#include "genotypekit.hpp" -#include "snarls.hpp" -#include "path.hpp" -#include "path_index.hpp" -#include "support_caller.hpp" -#include -#include "nested_traversal_finder.hpp" - -//#define debug - -namespace vg { - -// How many bases may we put in an allele in VCF if we expect GATK to be able to -// parse it? -// 0 means no maximum is enforced. -const static int MAX_ALLELE_LENGTH = 0; - -// Minimum log likelihood -const static double LOG_ZERO = (double)-1e100; - -// convert to string using stringstream (to replace to_string when we want sci. notation) -template -string to_string_ss(T val) { - stringstream ss; - ss << val; - return ss.str(); -} - -/** - * We need to suppress overlapping variants, but interval trees are hard to - * write. This accomplishes the collision check with a massive bit vector. - */ -struct IntervalBitfield { - // Mark every position that's used in a variant - vector used; - - /** - * Make a new IntervalBitfield covering a region of the specified length. - */ - inline IntervalBitfield(size_t length) : used(length) { - // Nothing to do - } - - /** - * Scan for a collision (O(n) in interval length) - */ - inline bool collides(size_t start, size_t pastEnd) { - for(size_t i = start; i < pastEnd; i++) { - if(used[i]) { - return true; - } - } - return(false); - } - - /** - * Take up an interval. - */ - inline void add(size_t start, size_t pastEnd) { - for(size_t i = start; i < pastEnd; i++) { - used[i] = true; - } - } -}; - -/** - * Get the strand bias of a Support. - */ -double strand_bias(const Support& support) { - return max(support.forward(), support.reverse()) / (support.forward() + support.reverse()); -} - -/** - * Make a letter into a full string because apparently that's too fancy for the - * standard library. - */ -string char_to_string(const char& letter) { - string toReturn; - toReturn.push_back(letter); - return toReturn; -} - -/** - * Write a minimal VCF header for a file with the given samples, and the given - * contigs with the given lengths. 
- */
-void write_vcf_header(ostream& stream, const vector<string>& sample_names,
-                      const vector<string>& contig_names, const vector<size_t>& contig_sizes,
-                      int min_mad_for_filter, int max_dp_for_filter, double max_dp_multiple_for_filter,
-                      double max_local_dp_multiple_for_filter, double min_ad_log_likelihood_for_filter,
-                      bool xref_enabled) {
-
-    stream << "##fileformat=VCFv4.2" << endl;
-    stream << "##ALT=<ID=NON_REF,Description=\"Represents any possible alternative allele at this location\">" << endl;
-    if (xref_enabled) {
-        stream << "##INFO=<ID=XREF,Number=0,Type=Flag,Description=\"Present in original graph\">" << endl;
-    }
-    stream << "##INFO=<ID=XSEE,Number=.,Type=String,Description=\"Original graph node:offset cross-references\">" << endl;
-    stream << "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">" << endl;
-    stream << "##FILTER=<ID=lowad,Description=\"Variant does not meet minimum allele read support threshold of " << min_mad_for_filter << "\">" << endl;
-    stream << "##FILTER=<ID=highabsdp,Description=\"Variant has total depth greater than " << max_dp_for_filter << "\">" << endl;
-    stream << "##FILTER=<ID=highreldp,Description=\"Variant has total depth greater than " << max_dp_multiple_for_filter << " times global baseline\">" << endl;
-    stream << "##FILTER=<ID=highlocaldp,Description=\"Variant has total depth greater than " << max_local_dp_multiple_for_filter << " times local baseline\">" << endl;
-    stream << "##FILTER=<ID=lowxadl,Description=\"Variant has AD log likelihood less than " << min_ad_log_likelihood_for_filter << "\">" << endl;
-    stream << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" << endl;
-    stream << "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read Depth\">" << endl;
-    stream << "##FORMAT=<ID=AD,Number=.,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed\">" << endl;
-    stream << "##FORMAT=<ID=GL,Number=G,Type=Float,Description=\"Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype\">" << endl;
-    stream << "##FORMAT=<ID=SB,Number=4,Type=Integer,Description=\"Forward and reverse support for ref and alt alleles\">" << endl;
-    // We need this field to stratify on for VCF comparison. The info is in SB but vcfeval can't pull it out
-    stream << "##FORMAT=<ID=XAAD,Number=1,Type=Integer,Description=\"Alt allele read count\">" << endl;
-    stream << "##FORMAT=<ID=AL,Number=.,Type=Float,Description=\"Allelic likelihoods for the ref and alt alleles in the order listed\">" << endl;
-
-    for(size_t i = 0; i < contig_names.size(); i++) {
-        // Announce the contigs as well.
-        stream << "##contig=<ID=" << contig_names[i] << ",length=" << contig_sizes[i] << ">" << endl;
-    }
-
-    // Now the column header line
-    stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT";
-    for (auto& sample_name : sample_names) {
-        // Append columns for all the samples
-        stream << "\t" << sample_name;
-    }
-    // End the column header line
-    stream << endl;
-}
-
-/**
- * Return true if a variant may be output, or false if this variant is valid but
- * the GATK might choke on it.
- *
- * Mostly used to throw out variants with very long alleles, because GATK has an
- * allele length limit. How alleles that really *are* 1 megabase deletions are
- * to be specified to GATK is left as an exercise to the reader.
- */
-bool can_write_alleles(vcflib::Variant& variant) {
-
-    for(auto& allele : variant.alleles) {
-        if(MAX_ALLELE_LENGTH > 0 && allele.size() > MAX_ALLELE_LENGTH) {
-            return false;
-        }
-    }
-    return true;
-}
-
-
-/**
- * Given a collection of pileups by original node ID, and a set of original node
- * id:offset cross-references in both ref and alt categories, produce a VCF
- * comment line giving the pileup for each of those positions on those nodes.
- * Includes a trailing newline if nonempty.
- *
- * TODO: VCF comments aren't really a thing.
- */
-string get_pileup_line(const map<int64_t, NodePileup>& node_pileups,
-                       const set<pair<int64_t, size_t>>& refCrossreferences,
-                       const set<pair<int64_t, size_t>>& altCrossreferences) {
-    // We'll make a stringstream to write to.
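// For orientation, a sketch of the kind of header write_vcf_header() produces.
// The INFO/FILTER/FORMAT descriptions reconstructed above are best-effort; this
// stand-alone program only shows the overall shape, with illustrative values:

#include <iostream>

int main() {
    std::cout << "##fileformat=VCFv4.2\n";
    std::cout << "##FILTER=<ID=lowad,Description=\"Below min allele depth of 4\">\n"; // threshold from min_mad_for_filter
    std::cout << "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
    std::cout << "##contig=<ID=x,length=1000>\n"; // one line per contig name/size pair
    std::cout << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n";
    return 0;
}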
- stringstream out; - - out << "#"; - - for(const auto& xref : refCrossreferences) { - // For every cross-reference - if(node_pileups.count(xref.first) && node_pileups.at(xref.first).base_pileup_size() > xref.second) { - // If we have that base pileup, grab it - auto basePileup = node_pileups.at(xref.first).base_pileup(xref.second); - - out << xref.first << ":" << xref.second << " (ref) " << basePileup.bases() << "\t"; - } - // Nodes with no pileups (either no pileups were provided or they didn't - // appear/weren't visited by reads) will not be mentioned on this line - } - - for(const auto& xref : altCrossreferences) { - // For every cross-reference - if(node_pileups.count(xref.first) && node_pileups.at(xref.first).base_pileup_size() > xref.second) { - // If we have that base pileup, grab it - auto basePileup = node_pileups.at(xref.first).base_pileup(xref.second); - - out << xref.first << ":" << xref.second << " (alt) " << basePileup.bases() << "\t"; - } - // Nodes with no pileups (either no pileups were provided or they didn't - // appear/weren't visited by reads) will not be mentioned on this line - } - // TODO: make these nearly-identical loops a loop or a lambda or something. - - if(out.str().size() > 1) { - // We actually found something. Send it out with a trailing newline - out << endl; - return out.str(); - } else { - // Give an empty string. - return ""; - } -} - -SupportCaller::PrimaryPath::PrimaryPath(SupportAugmentedGraph& augmented, const string& ref_path_name, size_t ref_bin_size): - ref_bin_size(ref_bin_size), index(augmented.graph, ref_path_name, true), name(ref_path_name) { - - // Follow the reference path and extract indexes we need: index by node ID, - // index by node start, and the reconstructed path sequence. - PathIndex index(augmented.graph, ref_path_name, true); - - if (index.sequence.size() == 0) { - // No empty reference paths allowed - throw runtime_error("Reference path cannot be empty"); - } - - if (index.by_id.size() != index.by_start.size()) { - // Catch as soon as possible the case where the path being called against is cyclic. - // We don't know how to deal with that so we abort. - cerr << "error[SupportCaller::PrimaryPath]: Some nodes occur more than once along the path " - << ref_path_name << " being called against. Calling against a cyclic path is not well-defined. " - << "Calling will now be aborted. " - << "See https://github.com/vgteam/vg/issues/1946 for more information and potential workarounds." << endl;\ - exit(1); - } - - // Store support binned along reference path; - // Last bin extended to include remainder - ref_bin_size = min(ref_bin_size, index.sequence.size()); - if (ref_bin_size <= 0) { - // No zero-sized bins allowed - throw runtime_error("Reference bin size must be 1 or larger"); - } - // Start out all the bins empty. - binned_support = vector(max(1, int(index.sequence.size() / ref_bin_size)), Support()); - - // Crunch the numbers on the reference and its read support. How much read - // support in total (node length * aligned reads) does the primary path get? - total_support = Support(); - for(auto& pointerAndSupport : augmented.node_supports) { - if(index.by_id.count(pointerAndSupport.first)) { - // This is a primary path node. 
Add in the total read bases supporting it - total_support += augmented.graph.get_length(augmented.graph.get_handle(pointerAndSupport.first)) * pointerAndSupport.second; - - // We also update the total for the appropriate bin - size_t bin = index.by_id[pointerAndSupport.first].first / ref_bin_size; - if (bin == binned_support.size()) { - --bin; - } - binned_support[bin] = binned_support[bin] + - augmented.graph.get_length(augmented.graph.get_handle(pointerAndSupport.first)) * pointerAndSupport.second; - } - } - - // Average out the support bins too (in place) - min_bin = 0; - max_bin = 0; - for (int i = 0; i < binned_support.size(); ++i) { - // Compute the average over the bin's actual size - binned_support[i] = binned_support[i] / ( - i < binned_support.size() - 1 ? (double)ref_bin_size : - (double)(ref_bin_size + index.sequence.size() % ref_bin_size)); - - // See if it's a min or max - if (binned_support[i] < binned_support[min_bin]) { - min_bin = i; - } - if (binned_support[i] > binned_support[max_bin]) { - max_bin = i; - } - } - -} - -const Support& SupportCaller::PrimaryPath::get_support_at(size_t primary_path_offset) const { - return get_bin(get_bin_index(primary_path_offset)); -} - -size_t SupportCaller::PrimaryPath::get_bin_index(size_t primary_path_offset) const { - // Find which coordinate bin the position is in - size_t bin = primary_path_offset / ref_bin_size; - if (bin == get_total_bins()) { - --bin; - } - return bin; -} - -size_t SupportCaller::PrimaryPath::get_min_bin() const { - return min_bin; -} - -size_t SupportCaller::PrimaryPath::get_max_bin() const { - return max_bin; -} - -const Support& SupportCaller::PrimaryPath::get_bin(size_t bin) const { - return binned_support[bin]; -} - -size_t SupportCaller::PrimaryPath::get_total_bins() const { - return binned_support.size(); -} - -Support SupportCaller::PrimaryPath::get_average_support() const { - return get_total_support() / get_index().sequence.size(); -} - -Support SupportCaller::PrimaryPath::get_average_support(const map& paths) { - // Track the total support overall - Support total; - // And the total number of bases - size_t bases; - - for (auto& kv : paths) { - // Sum over all paths - total += kv.second.get_total_support(); - bases += kv.second.get_index().sequence.size(); - } - - // Then divide - return total / bases; -} - -Support SupportCaller::PrimaryPath::get_total_support() const { - return total_support; -} - -PathIndex& SupportCaller::PrimaryPath::get_index() { - return index; -} - -const PathIndex& SupportCaller::PrimaryPath::get_index() const { - return index; -} - -const string& SupportCaller::PrimaryPath::get_name() const { - return name; -} - -map::iterator SupportCaller::find_path(const Snarl& site) { - for(auto i = primary_paths.begin(); i != primary_paths.end(); ++i) { - // Scan the whole map with an iterator - - if (i->second.get_index().by_id.count(site.start().node_id()) && - i->second.get_index().by_id.count(site.end().node_id())) { - // This path threads through this site - return i; - } - } - // Otherwise we hit the end and found no path that this site can be strung - // on. - return primary_paths.end(); -} - -size_t SupportCaller::get_deletion_length(const NodeSide& end1, const NodeSide& end2, - SupportAugmentedGraph& augmented) { - - for(auto i = primary_paths.begin(); i != primary_paths.end(); ++i) { - // Scan the whole map with an iterator - - map>& idx = i->second.get_index().by_id; - map>::iterator idx_it1 = idx.find(end1.node); - map>::iterator idx_it2 = idx_it1 != idx.end() ? 
idx.find(end2.node) : idx.end(); - - if (idx_it1 != idx.end() && idx_it2 != idx.end()) { - - // find the positions of our nodesides in the path. - size_t pos1 = idx_it1->second.first; - if (end1.is_end != idx_it1->second.second) { - pos1 += augmented.graph.get_length(augmented.graph.get_handle(idx_it1->first)); - } - size_t pos2 = idx_it2->second.first; - if (end2.is_end != idx_it2->second.second) { - pos2 += augmented.graph.get_length(augmented.graph.get_handle(idx_it2->first)); - } - - if (pos1 > pos2) { - std::swap(pos1, pos2); - } - return pos2 - pos1; - } - } - - return 0; -} - -/** - * Trace out the given traversal, handling nodes, child snarls, and edges - * associated with particular visit numbers. - */ -static void trace_traversal(const SnarlTraversal& traversal, const Snarl* site, - function handle_node, - function handle_edge, - function handle_child) { - - // Must at least have start and end - assert(site == nullptr || traversal.visit_size() >= 2); - - // If we're given a site, we assume we have snarl endpoints that we want to skip - // Otherise, we handle the endpoints. This is toggled with the three variables below. - int first_i = site != nullptr ? 1 : 0; - int last_i = site != nullptr ? traversal.visit_size() - 1 : traversal.visit_size(); - int idx_offset = site != nullptr ? -1 : 0; - - // Look at the edge leading from the start (also handles deletion traversals) - if (site != nullptr || traversal.visit_size() > 1) { - handle_edge(0, to_right_side(traversal.visit(0)), to_left_side(traversal.visit(1))); - } - - for(int64_t i = first_i; i < last_i; i++) { - // For all the (internal) visits... - auto& visit = traversal.visit(i); - - if (visit.node_id() != 0) { - // This is a visit to a node - - // Find the node - handle_node(i + idx_offset, visit.node_id(), visit.backward()); - } else { - // This is a snarl - handle_child(i + idx_offset, visit.snarl(), visit.backward()); - } - - if (i < traversal.visit_size() - 1) { - auto& next_visit = traversal.visit(i + 1); - - if (visit.node_id() == 0 && next_visit.node_id() == 0 && - to_right_side(visit).flip() == to_left_side(next_visit)) { - - // These are two back-to-back child snarl visits, which - // share a node and have no connecting edge. -#ifdef debug - cerr << "No edge needed for back-to-back child snarls" << endl; -#endif - - } - else { - // Do the edge to it - handle_edge(i + idx_offset, to_right_side(visit), to_left_side(next_visit)); - } - } - } -} - -/** - * Get the min support, total support, bp size (to divide total by for average - * support), and fraction of unsupported edges for a traversal, optionally special-casing the - * material used by another traversal. Material used by another traversal only - * makes half its coverage available to this traversal. 
- */ -tuple SupportCaller::get_traversal_support(SupportAugmentedGraph& augmented, - SnarlManager& snarl_manager, const Snarl* site, const SnarlTraversal& traversal, - const vector& already_used, const SnarlTraversal* ref_traversal) { - -#ifdef debug - cerr << "Evaluate traversal: " << endl; - for (size_t i = 0; i < traversal.visit_size(); i++) { - cerr << "\t" << pb2json(traversal.visit(i)) << endl; - } - if (!already_used.empty()) { - for (auto share_trav : already_used) { - cerr << "Need to share: " << endl; - for (size_t i = 0; i < share_trav->visit_size(); i++) { - cerr << "\t" << pb2json(share_trav->visit(i)) << endl; - } - } - } -#endif - - // First work out the stuff we need to share - multiset shared_nodes; - multiset shared_children; - multiset shared_edges; - for (auto shared_trav : already_used) { - // Mark all the nodes and edges that the other traverasl uses. - trace_traversal(*shared_trav, site, [&](size_t i, id_t node, bool is_reverse) { - shared_nodes.insert(node); - }, [&](size_t i, NodeSide end1, NodeSide end2) { - shared_edges.insert(augmented.graph.get_edge(end1, end2)); - }, [&](size_t i, Snarl child, bool is_reverse) { - shared_children.insert(child); - }); - } - - // And the reference stuff we want to average out separately - set ref_nodes; - set ref_children; - set ref_edges; - if (ref_traversal != nullptr && ref_traversal != &traversal) { - // Mark all the nodes and edges that the ref traverasl uses. - trace_traversal(*ref_traversal, site, [&](size_t i, id_t node, bool is_reverse) { - ref_nodes.insert(node); - }, [&](size_t i, NodeSide end1, NodeSide end2) { - ref_edges.insert(augmented.graph.get_edge(end1, end2)); - ref_nodes.insert(end1.node); - ref_nodes.insert(end2.node); - }, [&](size_t i, Snarl child, bool is_reverse) { - ref_children.insert(child); - }); - } - - // Compute min and total supports, and bp sizes, for all the visits by - // number. If we have a site, we subtract two here as trace_traversal skips the endpoints - size_t record_count = max(1, site != nullptr ? traversal.visit_size() - 2 : traversal.visit_size()); - // What's the min support observed at every visit (inclusing edges)? - // Support is on the forward and reverse strand relative to the visits. - vector min_supports(record_count, make_support(INFINITY, INFINITY, INFINITY)); - // And the total support (ignoring edges)? - // Support is on the forward and reverse strand relative to the visits. - vector total_supports(record_count, Support()); - // And the bp size of each visit - vector visit_sizes(record_count, 0); - // Keep reference-overlapping support separate - vector total_ref_supports(record_count, Support()); - vector ref_visit_sizes(record_count, 0); - // Keep track of which direction we're going on the reference - // (set to true if we go through a reversing edge) - bool ref_reversed = false; - // apply max_unsupported_edge_size cutoff (todo: less hacky) - bool zero_avg_support = false; - - // Don't count nodes shared between child snarls more than once. - set coverage_counted; - - trace_traversal(traversal, site, [&](size_t i, id_t node_id, bool is_reverse) { - // Find the node - Node* node = augmented.graph.get_node(node_id); - - // Grab this node's total support along its length - // Make sure to only use half the support if the node is shared or none if its shared twice - double shared_factor = 1. 
- 0.5 * (double) min(2UL, shared_nodes.count(node_id)); - auto got_support = augmented.get_support(node->id()) * node->sequence().size() * shared_factor; - - if (is_reverse) { - // Put the support relative to the traversal's strand - got_support = flip(got_support); - } - -#ifdef debug - cerr << "From node " << node->id() << " get " << got_support << endl; -#endif - // don't count inverted nodes as reference - bool count_as_ref = ref_nodes.count(node_id) && !ref_reversed; - - if (!count_as_ref) { - // update totals for averaging - total_supports[i] += got_support; - visit_sizes[i] += node->sequence().size(); - } else { - // reference-overlapping support kept separate for later filters - total_ref_supports[i] += got_support; - ref_visit_sizes[i] += node->sequence().size(); - } - - // And update its min support - min_supports[i] = support_min(min_supports[i], augmented.get_support(node->id()) * shared_factor); - - }, [&](size_t i, NodeSide end1, NodeSide end2) { - // This is an edge - Edge* edge = augmented.graph.get_edge(end1, end2); - assert(edge != nullptr); - - // edges between adjacent nodes or ones that go off primary path count for size 1 - // otherwise, they count as the number of deleted bases on the primary path - size_t edge_size = max(1UL, get_deletion_length(end1, end2, augmented)); - - // Count as 1 base worth for the total/average support - // Make sure to only use half the support if the edge is shared - double shared_factor = 1. - 0.5 * (double) min(2UL, shared_edges.count(edge)); - edge_t edge_handle = augmented.graph.edge_handle(augmented.graph.get_handle(end1.node, !end1.is_end), - augmented.graph.get_handle(end2.node, end2.is_end)); - auto got_support = (double)edge_size * augmented.get_support(edge_handle) * shared_factor; - - // Prevent averaging over SVs with 0 support, just because the reference part of the traversal has support - if (edge_size > max_unsupported_edge_size && support_val(got_support) == 0) { - zero_avg_support = true; - } - - if (end1.node > end2.node || (end1.node == end2.node && !end1.is_end)) { - // We follow the edge backward, from high ID to low ID, or backward around a normal self loop. - // TODO: Make sure this check agrees on which orientation is which after augmentation re-numbers things! - // Put the support relative to the traversal's strand - got_support = flip(got_support); - } - -#ifdef debug - cerr << "From edge " << edge->from() << " " << edge->from_start() << " to " - << edge->to() << " " << edge->to_end() << " get " << got_support << endl; -#endif - - if (!ref_edges.count(edge)) { - total_supports[i] += got_support; - visit_sizes[i] += edge_size; - } else { - total_ref_supports[i] += got_support; - ref_visit_sizes[i] += edge_size; - } - - // flip our reversed flag if we hit an inversion edge - // todo: won't detect complex inversions that stray off reference - if (end1.is_end == end2.is_end && ref_nodes.count(end1.node) && ref_nodes.count(end2.node)) { - ref_reversed = !ref_reversed; - } - - // Min in its support - min_supports[i] = support_min(min_supports[i], augmented.get_support(edge_handle) * shared_factor); - }, [&](size_t i, Snarl child, bool is_reverse) { - // This is a child snarl, so get its max support. 
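// The shared_factor discount used above: support on material already claimed by
// another traversal counts half, and material claimed twice counts not at all. A
// tiny self-contained check of that exact formula, 1 - 0.5 * min(2, times_shared):

#include <algorithm>
#include <cassert>
#include <cstddef>

double shared_factor(std::size_t times_shared) {
    return 1.0 - 0.5 * static_cast<double>(std::min<std::size_t>(2, times_shared));
}

int main() {
    assert(shared_factor(0) == 1.0);  // unshared: full support
    assert(shared_factor(1) == 0.5);  // shared once: half support
    assert(shared_factor(2) == 0.0);  // shared twice: none
    assert(shared_factor(5) == 0.0);  // the count is capped at 2
    return 0;
}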
- - Support child_max; - size_t child_size = 0; - for (id_t node_id : snarl_manager.deep_contents(snarl_manager.manage(child), - augmented.graph, true).first) { - // For every node in the child - Node* node = augmented.graph.get_node(node_id); - - if (coverage_counted.count(node)) { - // Already used by another child snarl on this traversal - continue; - } - // Claim this node for this child. - coverage_counted.insert(node); - - Support child_support = augmented.get_support(node->id()); - - // TODO: We can't tell which strand of a child snarl's contained - // nodes corresponds to which strand of the child snarl. Just - // average across strands. - double average_support = total(child_support) / 2; - child_support.set_forward(average_support); - child_support.set_reverse(average_support); - - // How many distinct reads must use the child, given the distinct reads on this node? - child_max = support_max(child_max, augmented.get_support(node->id())); - - // Add in the node's size to the child - child_size += node->sequence().size(); - -#ifdef debug - cerr << "From child snarl node " << node->id() << " get " - << child_support << " for distinct " << child_max << endl; -#endif - } - - double shared_factor = 1. - 0.5 * (double) min(2UL, shared_children.count(child)); - // Make sure to halve the support if the child is shared - child_max *= shared_factor; - - // don't count inverted children as reference - bool count_as_ref = ref_children.count(child) && !ref_reversed; - - // Smoosh support over the whole child - if (!count_as_ref) { - total_supports[i] += child_max * child_size; - visit_sizes[i] += child_size; - } else { - total_ref_supports[i] += child_max * child_size; - ref_visit_sizes[i] += child_size; - } - - if (child_size != 0) { - // We actually have some nodes to our name. - min_supports[i] = support_min(min_supports[i], child_max); - } - - }); - - // Now aggregate across visits and their edges - - // What's the total support for this traversal? - Support total_support; - for (auto& support : total_supports) { - total_support += support; - } - - // And the length over which we have it (for averaging) - size_t total_size = 0; - for (auto& size : visit_sizes) { - total_size += size; - } - - // If we are looking at a non-ref traversal, we ignore the reference support - // if it's higher than the non-reference support for the purposes of computing - // average support. This is to prevent spurious calls when the actual variant - // has low support, but most of the bases in the traversal are shared with the - // well-supported reference. - if (!ref_nodes.empty() || !ref_edges.empty() || !ref_children.empty()) { - Support total_ref_support; - for (auto& support : total_ref_supports) { - total_ref_support += support; - } - size_t total_ref_size = 0; - for (auto& size : ref_visit_sizes) { - total_ref_size += size; - } - if (support_val(total_ref_support) / total_ref_size <= - support_val(total_support) / total_size) { - total_support += total_ref_support; - total_size += total_ref_size; - } - } - - // Apply the max_unsupported_edge_size cutoff - if (zero_avg_support) { - total_support = Support(); - } - - // And the min support? 
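// The min-support aggregation that follows seeds with INFINITY and resets to zero
// when nothing was observed at all. A sketch with a plain double standing in for
// vg's two-strand Support object:

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

double min_support_over(const std::vector<double>& supports) {
    double m = INFINITY;             // like make_support(INFINITY, INFINITY, INFINITY)
    for (double s : supports) {
        m = std::min(m, s);
    }
    return std::isinf(m) ? 0.0 : m;  // no material at all means no support
}

int main() {
    std::cout << min_support_over({3.0, 1.5, 2.0}) << "\n";  // prints 1.5
    std::cout << min_support_over({}) << "\n";               // prints 0
    return 0;
}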
- Support min_support = make_support(INFINITY, INFINITY, INFINITY); - for (auto& support : min_supports) { - min_support = support_min(min_support, support); - } - - if (min_support.forward() == INFINITY || min_support.reverse() == INFINITY) { - // If we have actually no material, say we have actually no support - min_support = Support(); - } - - // Spit out the supports, the size in bases observed. - return tie(min_support, total_support, total_size); - -} - -/** Get the support for each traversal in a list, using average_support_switch_threshold - to decide if we use the minimum or average. Apply the min_supported_edges cutoff - to set support to 0 if not enough edges were supported. -*/ -tuple, vector > SupportCaller::get_traversal_supports_and_sizes( - SupportAugmentedGraph& augmented, SnarlManager& snarl_manager, const Snarl& site, - const vector& traversals, const vector& minus_traversals) { - - // How long is the longest traversal? - // Sort of approximate because of the way nested site sizes are estimated. - size_t longest_traversal_length = 0; - - // And the shortest one? - size_t shortest_traversal_length = numeric_limits::max(); - - // Calculate average and min support for all the traversals of this snarl. - vector min_supports; - vector average_supports; - vector sizes; - for(auto& traversal : traversals) { - // Go through all the SnarlTraversals for this Snarl - - // What's the total support for this traversal? - Support total_support; - // And the length over which we have it (for averaging) - size_t total_size; - // And the min support? - Support min_support; - // Trace the traversal and get its support - tie(min_support, total_support, total_size) = get_traversal_support( - augmented, snarl_manager, &site, traversal, minus_traversals, &traversals.front()); - - // Add average and min supports to vectors. Note that average support - // ignores edges. - min_supports.push_back(min_support); - average_supports.push_back(total_size != 0 ? total_support / total_size : Support()); - -#ifdef debug - cerr << "Min: " << min_support << " Total: " << total_support << " Average: " << average_supports.back() << endl; -#endif - - // Remember a new longest traversal length - longest_traversal_length = max(longest_traversal_length, total_size); - // And a new shortest one - shortest_traversal_length = min(shortest_traversal_length, total_size); - // and the current size - sizes.push_back(total_size); - } - -#ifdef debug - cerr << "Min vs. average" << endl; -#endif - for (size_t i = 0; i < average_supports.size(); i++) { -#ifdef debug - cerr << "\t" << min_supports.at(i) << " vs. " << average_supports.at(i) << endl; -#endif - } - - return (longest_traversal_length > average_support_switch_threshold || use_average_support) ? 
- tie(average_supports, sizes) : - tie(min_supports, sizes); -} - -vector SupportCaller::get_inversion_supports( - SupportAugmentedGraph& augmented, SnarlManager& snarl_manager, const Snarl& site, - const vector& traversals, const vector& traversal_sizes, - int best_allele, int second_best_allele) { - vector inversion_supports; - int trav_size = -1; - - // ATM we only use this for distinguishing between 0/1 and 1/1 inversions - if (best_allele != -1 && second_best_allele != -1 && (best_allele == 0 || second_best_allele == 0) && - traversals[best_allele].visit_size() == traversals[second_best_allele].visit_size() && - traversals[best_allele].visit_size() > 2) { - trav_size = traversals[best_allele].visit_size(); - - // Make sure we are dealing with a simple inversion, where one traversal is the same as the other - // but in the opposite direction (ignoring endpoints) - for (int i = 1; i < trav_size - 1; ++i) { - const Visit& bvis = traversals[best_allele].visit(i); - const Visit& svis = traversals[second_best_allele].visit(trav_size - 1 - i); - - // must both be snarls or same nodes - if (bvis.node_id() != svis.node_id() || - // must be in reverse orientation relative to each other - bvis.backward() == svis.backward() || - // if both snarls - (bvis.node_id() == 0 && svis.node_id() == 0 && - // then they must have same endpoints - (bvis.snarl().start() != svis.snarl().start() || - bvis.snarl().end() != svis.snarl().end()))) { - return inversion_supports; - } - } - - size_t longest_traversal_length = *max_element(traversal_sizes.begin(), traversal_sizes.end()); - - // compute the supports, keep them in same sized array as normal supports to be less confusing - inversion_supports.resize(traversals.size()); - for (auto allele : {best_allele, second_best_allele}) { - - // the edge going out of the site's start - const Visit& next_visit = traversals[allele].visit(1).node_id() ? - traversals[allele].visit(1) : - traversals[allele].visit(1).backward() ? - traversals[allele].visit(1).snarl().end() : - traversals[allele].visit(1).snarl().start(); - bool next_backward = traversals[allele].visit(1).node_id() ? - traversals[allele].visit(1).backward() : - next_visit.backward() != traversals[allele].visit(1).backward(); - Edge* edge1 = augmented.graph.get_edge(NodeSide(traversals[allele].visit(0).node_id(), - !traversals[allele].visit(0).backward()), - NodeSide(next_visit.node_id(), next_backward)); - - // the edge going into the site's end - const Visit& prev_visit = traversals[allele].visit(trav_size - 2).node_id() ? - traversals[allele].visit(trav_size - 2) : - traversals[allele].visit(trav_size - 2).backward() ? - traversals[allele].visit(trav_size - 2).snarl().start() : - traversals[allele].visit(trav_size - 2).snarl().end(); - bool prev_backward = traversals[allele].visit(trav_size - 2).node_id() ? 
-            traversals[allele].visit(trav_size - 2).backward() :
-            prev_visit.backward() != traversals[allele].visit(trav_size - 2).backward();
-        Edge* edge2 = augmented.graph.get_edge(NodeSide(prev_visit.node_id(), !prev_backward),
-                                               NodeSide(traversals[allele].visit(trav_size - 1).node_id(),
-                                                        traversals[allele].visit(trav_size - 1).backward()));
-
-        assert(edge1 != nullptr && edge2 != nullptr);
-
-        edge_t edge_handle1 = augmented.graph.edge_handle(augmented.graph.get_handle(edge1->from(), edge1->from_start()),
-                                                          augmented.graph.get_handle(edge1->to(), edge1->to_end()));
-        edge_t edge_handle2 = augmented.graph.edge_handle(augmented.graph.get_handle(edge2->from(), edge2->from_start()),
-                                                          augmented.graph.get_handle(edge2->to(), edge2->to_end()));
-
-        if (longest_traversal_length > average_support_switch_threshold || use_average_support) {
-            inversion_supports[allele] = (augmented.get_support(edge_handle1) + augmented.get_support(edge_handle2)) / 2;
-        } else {
-            inversion_supports[allele] = min(augmented.get_support(edge_handle1), augmented.get_support(edge_handle2));
-        }
-    }
-    }
-
-    return inversion_supports;
-}
-
-
-vector<SnarlTraversal> SupportCaller::find_best_traversals(
-    SupportAugmentedGraph& augmented,
-    SnarlManager& snarl_manager, TraversalFinder* finder, const Snarl& site,
-    const Support& baseline_support, size_t copy_budget,
-    function<void(Locus&, const Snarl*, const vcflib::Variant*)> emit_locus) {
-
-    // We need to be a-directed-cyclic and start-end-reachable for the traversal finder to work right.
-    assert(site.start_end_reachable());
-    assert(site.directed_acyclic_net_graph());
-
-
-#ifdef debug
-    cerr << "Site " << site << endl;
-#endif
-
-
-    vector<vector<int>> here_alleles;
-    vector<vcflib::Variant*> site_variants;
-    vector<SnarlTraversal> here_traversals;
-
-    // Get traversals of this Snarl, with Visits to child Snarls.
-    // The 0th is always the reference traversal if we're on a primary path
-    // If we have a VCF to genotype, we also associate each traversal with a haplotype from the vcf.
-    if (!((string)recall_vcf_filename).empty()) {
-        pair<vector<pair<SnarlTraversal, vector<int>>>, vector<vcflib::Variant*>> allele_travs =
-            dynamic_cast<VCFTraversalFinder*>(finder)->find_allele_traversals(site);
-        // todo: harmonize interface
-        site_variants = allele_travs.second;
-        for (auto& ta : allele_travs.first) {
-            here_traversals.push_back(ta.first);
-            here_alleles.push_back(ta.second);
-        }
-        // there can be some trivial snarls here that don't match variants.
-        if (here_traversals.empty()) {
-            return here_traversals;
-        }
-    } else {
-        here_traversals = finder->find_traversals(site);
-    }
-
-
-#ifdef debug
-    cerr << "Found " << here_traversals.size() << " traversals" << endl;
-#endif
-
-
-    // Make a Locus to hold all our stats for the different traversals
-    // available. The 0th will always be the ref traversal if we're on a primary
-    // path.
-    Locus locus;
-
-    vector<Support> supports;
-    vector<size_t> traversal_sizes;
-    // Calculate the support for all the traversals of this snarl.
-    tie(supports, traversal_sizes) = get_traversal_supports_and_sizes(
-        augmented, snarl_manager, site, here_traversals);
-
-    ////////////////////////////////////////////////////////////////////////////
-
-    // look at all the paths for the site and pick the best one
-    function<int(const vector<Support>&, vector<int>)> get_best_allele = [this](
-        const vector<Support>& supports, vector<int> skips) {
-        int best_allele = -1;
-        for(size_t i = 0; i < supports.size(); i++) {
-            if(std::find(skips.begin(), skips.end(), i) == skips.end() && (
-                   best_allele == -1 || support_val(supports[best_allele]) <= support_val(supports[i]))) {
-                best_allele = i;
-            }
-        }
-        return best_allele;
-    };
-
-    // Now look at all the paths for the site and pick the best one
-    int best_allele = get_best_allele(supports, {});
-
-    // We should always have a best allele; we may sometimes have a second best.
-    assert(best_allele != -1);
-
-#ifdef debug
-    cerr << "Choose best allele: " << best_allele << endl;
-#endif
-
-    // Then recalculate supports assuming we can't count anything shared with that best traversal
-    vector<Support> additional_supports;
-    tie(additional_supports, std::ignore) = get_traversal_supports_and_sizes(
-        augmented, snarl_manager, site, here_traversals, {&here_traversals.at(best_allele)});
-
-    // Then pick the second best one
-    int second_best_allele = get_best_allele(additional_supports, {best_allele});
-
-    // Problem: when using average support (necessary for long variants, especially
-    // in recall (no augmentation) mode), it's almost impossible to call a 1/1 inversion.
-    // This is because the nodes on the reference and inversion have the same support.
-    // The difference is in the edge supports at the snarl endpoints, but in general
-    // we need to allow for a few unsupported edges here and there to call indels.
-    // So this is a hack to identify simple inversions in order to restrict relative support
-    // calculations to the edges that differ between them in order to genotype correctly.
-    //
-    // Not sure what the best general solution to this is here, but it's something to keep
-    // in mind for the next generation of the caller. Factoring minimum support on the augmented
-    // graph would fix this in theory, but has yet to be competitive overall on benchmark data.
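// The "simple inversion" the comment above describes: two traversals of equal
// length whose interior visits walk the same nodes in reverse order with flipped
// orientations. A sketch of that test with a toy Visit (vg's Visit can also name
// child snarls, which this ignores):

#include <cassert>
#include <cstddef>
#include <vector>

struct Visit { long node; bool backward; };

bool is_simple_inversion(const std::vector<Visit>& a, const std::vector<Visit>& b) {
    if (a.size() != b.size() || a.size() <= 2) return false;
    for (std::size_t i = 1; i + 1 < a.size(); ++i) {
        const Visit& av = a[i];
        const Visit& bv = b[a.size() - 1 - i];  // mirror position, endpoints skipped
        if (av.node != bv.node || av.backward == bv.backward) return false;
    }
    return true;
}

int main() {
    std::vector<Visit> ref = {{1, false}, {2, false}, {3, false}, {4, false}};
    std::vector<Visit> inv = {{1, false}, {3, true},  {2, true},  {4, false}};
    assert(is_simple_inversion(ref, inv));
    return 0;
}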
- vector inversion_supports = get_inversion_supports( - augmented, snarl_manager, site, here_traversals, traversal_sizes, best_allele, second_best_allele); - -#ifdef debug - cerr << "Choose second best allele: " << second_best_allele << endl; -#endif - - // Hack for special case where we want to call a multiallelic alt even if the reference - // has better support than one or both alts - vector tertiary_supports; - int third_best_allele = -1; - if (second_best_allele != -1) { - tie(tertiary_supports, std::ignore) = get_traversal_supports_and_sizes( - augmented, snarl_manager, site, here_traversals, {&here_traversals.at(second_best_allele)}); - third_best_allele = get_best_allele(tertiary_supports, {best_allele, second_best_allele}); - } - - // Decide if we're an indel by looking at the traversal sizes - bool is_indel = traversal_sizes[best_allele] != traversal_sizes[0] || - (second_best_allele != -1 && traversal_sizes[0] != traversal_sizes[second_best_allele]); - bool is_indel_ma_2 = (second_best_allele != -1 && traversal_sizes[0] != traversal_sizes[second_best_allele]); - bool is_indel_ma_3 = (third_best_allele != -1 && traversal_sizes[0] != traversal_sizes[third_best_allele]); - - //////////////////////////////////////////////////////////////////////////// - - // Now make a genotype call at this site, up to the allowed copy number - - // TODO: Work out how to detect indels when there are nested sites and - // enable the indel bias multiple again. - double bias_multiple = 1.0; - - // How much support do we have for the top two alleles? - Support site_support = supports.at(best_allele); - if(second_best_allele != -1) { - site_support += supports.at(second_best_allele); - } - - // Pull out the different supports. Some of them may be the same. - Support best_support = supports.at(best_allele); - Support second_best_support; // Defaults to 0 - if(second_best_allele != -1) { - second_best_support = supports.at(second_best_allele); - } - Support third_best_support; - if (third_best_allele != -1) { - third_best_support = supports.at(third_best_allele); - } - - // Override inversion supports for sake of genotyping - Support best_support_gt = best_support; - Support second_best_support_gt = second_best_support; - if (!inversion_supports.empty()) { - best_support_gt = inversion_supports.at(best_allele); - second_best_support_gt = inversion_supports.at(second_best_allele); -#ifdef debug - cerr << "Overriding support for simple inversion for purposes of genotyping" << endl - << "allele " << best_allele << " from " << best_support << " to " << best_support_gt << endl - << "allele " << second_best_allele << " from " << second_best_support << " to " << second_best_support_gt << endl; -#endif - if (support_val(best_support_gt) < support_val(second_best_support_gt)) { - std::swap(best_allele, second_best_allele); - std::swap(best_support, second_best_support); - std::swap(best_support_gt, second_best_support_gt); -#ifdef debug - cerr << "Swapping best and second best allele in light of inversion support override" << endl; -#endif - } - } - - // As we do the genotype, we also compute the likelihood. Holds - // likelihood log 10. Starts out at "completely wrong". - double gen_likelihood = -1 * INFINITY; - - // Minimum allele depth of called alleles - double min_site_support = 0; - - // This is where we'll put the genotype. We only actually add it to the - // Locus if we are confident enough to actually call. - Genotype genotype; - - // We're going to make some really bad calls at low depth. 
We can - // pull them out with a depth filter, but for now just elide them. - if (support_val(site_support) >= support_val(baseline_support) * min_fraction_for_call * ((double) copy_budget) / 2) { - // We have enough to emit a call here. - - // If best and second best are close enough to be het, we call het. - // Otherwise, we call hom best. - - double bias_limit; - if (best_allele == 0) { - // Use ref bias limit - - // We decide closeness differently depending on whether best is ref - // or not. In practice, we use this to slightly penalize homozygous - // ref calls (by setting max_ref_het_bias higher than max_het_bias) - // and rather make a less supported alt call instead. This boost - // max sensitivity, and because everything is homozygous ref by - // default in VCF, any downstream filters will effectively reset - // these calls back to homozygous ref. TODO: This shouldn't apply - // when off the primary path! - bias_limit = max_ref_het_bias; - } else if (is_indel) { - // This is an indel - // Use indel bias limit - bias_limit = max_indel_het_bias; - } else { - // Use normal het bias limit - bias_limit = max_het_bias; - } - -#ifdef debug - cerr << best_allele << ", " << best_support << " and " - << second_best_allele << ", " << second_best_support << endl; - - if (support_val(second_best_support) > 0) { - cerr << "Bias: (limit " << bias_limit * bias_multiple << "):" - << support_val(best_support_gt)/support_val(second_best_support_gt) << endl; - } - - cerr << bias_limit * bias_multiple * support_val(second_best_support_gt) << " vs " - << support_val(best_support_gt) << endl; - - cerr << total(second_best_support) << " vs " << min_total_support_for_call << endl; -#endif - - // Call 1/2 : REF-Alt1/Alt2 even if Alt2 has only third best support - if (copy_budget >= 2 && - best_allele == 0 && - third_best_allele > 0 && - is_indel_ma_3 && - max_indel_ma_bias * bias_multiple * support_val(third_best_support) >= support_val(best_support) && - total(second_best_support) > min_total_support_for_call && - total(third_best_support) > min_total_support_for_call) { - // There's a second best allele and third best allele, and it's not too biased to call, - // and both alleles exceed the minimum to call them present, and the - // second-best and third-best alleles have enough support that it won't torpedo the - // variant. - -#ifdef debug - cerr << "Call as second best/third best" << endl; -#endif - // Say both are present - genotype.add_allele(second_best_allele); - genotype.add_allele(third_best_allele); - - // Get minimum support for filter (not assuming it's second_best just to be sure) - min_site_support = min(total(second_best_support), total(third_best_support)); - - // Make the call - *locus.add_genotype() = genotype; - } - else if (copy_budget >= 2 && - second_best_allele != -1 && - bias_limit * bias_multiple * support_val(second_best_support_gt) >= support_val(best_support_gt) && - total(best_support) > min_total_support_for_call && - total(second_best_support) > min_total_support_for_call) { - // There's a second best allele, and it's not too biased to call, - // and both alleles exceed the minimum to call them present, and the - // second-best allele has enough support that it won't torpedo the - // variant. 
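// The het-vs-hom choice the comments above describe, reduced to its core: call
// het when bias_limit * second_best >= best and both alleles pass the minimum
// total support. This sketch drops the copy budget, third alleles, and the
// indel-specific bias limits that the real branches also consider:

#include <iostream>

enum class Call { Het, HomBest, NoCall };

Call genotype(double best, double second, double bias_limit, double min_total) {
    if (best > min_total && second > min_total && bias_limit * second >= best) {
        return Call::Het;      // best/second-best
    }
    if (best > min_total) {
        return Call::HomBest;  // best/best
    }
    return Call::NoCall;       // coverage too low
}

int main() {
    std::cout << (genotype(10.0, 8.0, 6.0, 2.0) == Call::Het) << "\n";      // balanced: 1
    std::cout << (genotype(60.0, 3.0, 6.0, 2.0) == Call::HomBest) << "\n";  // biased: 1
    return 0;
}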
- -#ifdef debug - cerr << "Call as best/second best" << endl; -#endif - - // Say both are present - genotype.add_allele(best_allele); - genotype.add_allele(second_best_allele); - - // Get minimum support for filter (not assuming it's second_best just to be sure) - min_site_support = min(total(second_best_support), total(best_support)); - - // Make the call - *locus.add_genotype() = genotype; - - } else if (copy_budget >= 2 && total(best_support) > min_total_support_for_call) { - // The second best allele isn't present or isn't good enough, - // but the best allele has enough coverage that we can just call - // two of it. - -#ifdef debug - cerr << "Call as best/best" << endl; -#endif - - // Say the best is present twice - genotype.add_allele(best_allele); - genotype.add_allele(best_allele); - - // Get minimum support for filter - min_site_support = total(best_support); - - // Make the call - *locus.add_genotype() = genotype; - - } else if (copy_budget >= 1 && total(best_support) > min_total_support_for_call) { - // We're only supposed to have one copy, and the best allele is good enough to call - -#ifdef debug - cerr << "Call as best" << endl; -#endif - - // Say the best is present once - genotype.add_allele(best_allele); - - // Get minimum support for filter - min_site_support = total(best_support); - - // Make the call - *locus.add_genotype() = genotype; - } else { - // Either coverage is too low, or we aren't allowed any copies. - // We can't really call this as anything. - -#ifdef debug - cerr << "Do not call" << endl; -#endif - - // Don't add the genotype to the locus - assert(locus.genotype_size() == 0); - } - } else { - // Depth too low. Say we have no idea. - // TODO: elide variant? - - // Don't add the genotype to the locus - } - - // Add the supports to the locus, correcting for shared support vis a vis the called alleles - for (int i = 0; i < supports.size(); ++i) { - vector subtract_travs; - if (locus.genotype_size() > 0 && locus.genotype(0).allele_size() > 0 && i != locus.genotype(0).allele(0)) { - subtract_travs.push_back(&here_traversals[locus.genotype(0).allele(0)]); - } - if (locus.genotype_size() > 0 && locus.genotype(0).allele_size() > 1 && i != locus.genotype(0).allele(1)) { - subtract_travs.push_back(&here_traversals[locus.genotype(0).allele(1)]); - } - if (subtract_travs.empty()) { - // Blit supports over to the locus - *locus.add_support() = supports[i]; - } else { - // Subtract the supports from the called alleles - vector corrected_support; - tie(corrected_support, std::ignore) = - get_traversal_supports_and_sizes(augmented, snarl_manager, site, {here_traversals[i]}, subtract_travs); - *locus.add_support() = corrected_support[0]; - } - } - assert(locus.support_size() == here_traversals.size()); - - // Find the total support for the Locus across all alleles - Support locus_support; - for (auto& s : supports) { - // Sum up all the Supports form all alleles (even the non-best/second-best). - locus_support += s; - } - // Save support - *locus.mutable_overall_support() = locus_support; - - //////////////////////////////////////////////////////////////////////////// - - // Figure out what child snarls are touched by the paths we have called and - // how much copy number each should get. 
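// The child copy-number budgeting that follows: every called traversal that
// visits a child snarl grants that child one more copy in the recursion. A toy
// version where traversals are just lists of child-snarl ids:

#include <iostream>
#include <map>
#include <vector>

int main() {
    std::vector<std::vector<int>> called = {{7, 9}, {9}};  // two called alleles
    std::map<int, int> child_usage_counts;
    for (const auto& traversal : called) {
        for (int child : traversal) {
            ++child_usage_counts[child];
        }
    }
    std::cout << child_usage_counts[9] << "\n";  // 2: visited by both copies
    std::cout << child_usage_counts[7] << "\n";  // 1
    return 0;
}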
- map child_usage_counts; - for (size_t i = 0; i < genotype.allele_size(); i++) { - // For each copy we call as present, find the SnarlTraversal we're - // asserting - SnarlTraversal& traversal = here_traversals.at(genotype.allele(i)); - - for (size_t j = 1; j < traversal.visit_size() - 1; j++) { - // For each visit to a child snarl - auto& visit = traversal.visit(j); - if (visit.node_id() != 0) { - continue; - } - - // Find the child snarl pointer for the snarl we visit - const Snarl* child = snarl_manager.manage(visit.snarl()); - - // Say it's used one more time - child_usage_counts[child]++; - - } - } - - // Recursion not supported for VCF recall, as the traversal finder only returns - // fully-resolved traversals. - unordered_map> child_traversals; - if (((string)recall_vcf_filename).empty()) { - // Recurse and get traversals for children. We do this for all our children, - // even the ones called as CN 0, because we need the fully-specified - // traversals to build our Locus (which needs the alleles we rejected as - // having no copies). - for (const Snarl* child : snarl_manager.children_of(&site)) { - // Recurse on each child, giving a copy number budget according to the - // usage count call at this site. This produces fully realized - // traversals with no Visits to Snarls. - - // But make sure the child is going to be traversable first. We could - // just skip difficult children but then we'd be weirdly biased away - // from the paths they are on. - assert(child->start_end_reachable()); - assert(child->directed_acyclic_net_graph()); - - // Holds ref traversal, best, and optional second best for each child. - child_traversals[child] = find_best_traversals(augmented, snarl_manager, - finder, *child, baseline_support, child_usage_counts[child], emit_locus); - } - - for (auto kv : child_traversals) { - // All children must have at least two traversals (a ref and a best). - // Off the primary paths, the ref is sort of arbitrary. - assert(kv.second.size() >= 2); - } - } - - // Put the best (or ref) traversal for each child in our traversals that - // visit it (even if that contradicts the calls on the child) - vector concrete_traversals; - for (size_t traversal_number = 0; traversal_number < here_traversals.size(); traversal_number++) { - // For every abstract traversal of this site, starting with the ref traversal... - auto& abstract_traversal = here_traversals[traversal_number]; -#ifdef debug - cerr << "Concretizing abstract traversal " << pb2json(abstract_traversal) << endl; -#endif - - // Make a "concrete", node-level traversal for every abstract, Snarl- - // visiting traversal. - concrete_traversals.emplace_back(); - auto& concrete_traversal = concrete_traversals.back(); - - for (size_t i = 0; i < abstract_traversal.visit_size(); i++) { - // Go through all the visits in the abstract traversal - auto& abstract_visit = abstract_traversal.visit(i); - - if (abstract_visit.node_id() != 0) { - // If they're fully realized, just take them - *concrete_traversal.add_visit() = abstract_visit; - } else { - // If they're visits to children, look up the child - const Snarl* child = snarl_manager.manage(abstract_visit.snarl()); - - // Then blit the child's path over. This will be the ref path if - // we are concrete-izing this snarl's ref traversal, and the - // best path for the child otherwise. Keep in mind that we may - // be going through the child backward. - auto& child_traversal = child_traversals.at(child).at(traversal_number == 0 ? 
0 : 1); - -#ifdef debug - cerr << "Splicing in child traversal " << pb2json(child_traversal) << endl; -#endif - - size_t trav_transfer_start = 0; - if (i != 0) { - // There was a previous visit. It may have been a previous - // back-to-back snarl. - auto& last_visit = abstract_traversal.visit(i - 1); - if (last_visit.node_id() == 0 && to_right_side(last_visit).flip() == to_left_side(abstract_visit)) { - // It was indeed a previous back to back site. Don't add the entry node! -#ifdef debug - cerr << "Skip entry node for back-to-back sites" << endl; -#endif - trav_transfer_start++; - } - } - for (size_t j = trav_transfer_start; j < child_traversal.visit_size(); j++) { - // All the internal visits, in the correct order - *concrete_traversal.add_visit() = abstract_visit.backward() ? - reverse(child_traversal.visit(child_traversal.visit_size()- 1 - j)) : - child_traversal.visit(j); - } - } - } -#ifdef debug - cerr << "Finished concrete traversal " << pb2json(concrete_traversals.back()) << endl; -#endif - } - - for (auto& concrete_traversal : concrete_traversals) { - // Populate the Locus with those traversals by converting to paths - Path* converted = locus.add_allele(); - - for (size_t i = 0; i < concrete_traversal.visit_size(); i++) { - // Convert all the visits to Mappings and stick them in the Locus's Paths - *converted->add_mapping() = to_mapping(concrete_traversal.visit(i), augmented.graph); - } - } - - if (!((string)recall_vcf_filename).empty()) { - // we transform our locus into terms of the input vcf. this can split it up - // into several sites. each one will be emitted as usual with emit_locus - recall_locus(locus, site, here_traversals, here_alleles, site_variants, emit_locus); - } - else if (locus.genotype_size() > 0) { - // Emit the locus if we have a call - emit_locus(locus, &site, nullptr); - } - - // Build the list of traversals to return as ref, best, second best, with - // possible repeats. 
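
// [Editor's sketch] The splice above walks the child's visits in reverse order
// and flips each one whenever the parent traverses the child backward. The
// same idea in isolation (hypothetical MiniVisit stand-in for vg's Visit):
#include <vector>

struct MiniVisit { long node_id; bool backward; };

// Reverse a traversal: opposite visit order, each visit's orientation flipped.
std::vector<MiniVisit> reverse_traversal(const std::vector<MiniVisit>& trav) {
    std::vector<MiniVisit> out(trav.rbegin(), trav.rend());
    for (auto& v : out) v.backward = !v.backward;
    return out;
}
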
- vector to_return{concrete_traversals[0], concrete_traversals[best_allele]}; - if (second_best_allele != -1) { - to_return.push_back(concrete_traversals[second_best_allele]); - } - - // Return those important traversals - return to_return; - -} - -void SupportCaller::recall_locus(Locus& locus, const Snarl& site, vector& traversals, - vector>& trav_alleles, - vector& site_variants, - function emit_locus) -{ - - for (int var_idx = 0; var_idx < site_variants.size(); ++var_idx) { - // create a locus for this variant - Locus vcf_locus; - Genotype& vcf_genotype = *vcf_locus.add_genotype(); - - // resize support to be able to hold a value for each VCF allele - for (int i = 0; i < site_variants[var_idx]->alleles.size(); ++i) { - vcf_locus.add_support(); - } - - // find the best support for every VCF allele, even those that aren't called - for (int i = 0; i < trav_alleles.size(); ++i) { - int vcf_allele = trav_alleles[i][var_idx]; - *vcf_locus.mutable_support(vcf_allele) = support_max(vcf_locus.support(vcf_allele), - locus.support(i)); - } - - // convert the allele we called from our traversal list into the corresponding - // allele for this variant in the VCF - if (locus.genotype_size() > 0) { - for (int i = 0; i < locus.genotype(0).allele_size(); ++i) { - int called_allele = locus.genotype(0).allele(i); - int vcf_allele = trav_alleles[called_allele][var_idx]; - vcf_genotype.add_allele(vcf_allele); - // make absolutely sure we're using the right support for our called alleles - // the support is in terms of the entire snarl, and not the vcf variant - *vcf_locus.mutable_support(vcf_allele) = locus.support(called_allele); - } - } - - *vcf_locus.mutable_overall_support() = locus.overall_support(); - emit_locus(vcf_locus, &site, site_variants[var_idx]); - } -} - -// This function emits the given variant on the given primary path, as -// VCF. It needs to take the site as an argument because it may be -// called for children of the site we're working on right now. -void SupportCaller::emit_variant(map& contig_names_by_path_name, - vcflib::VariantCallFile& vcf, - SupportAugmentedGraph& augmented, - Support& baseline_support, - Support& global_baseline_support, - const Locus& locus, PrimaryPath& primary_path, const Snarl* site) { - - // Note that the locus paths will traverse our site forward, which - // may make them backward along the primary path. - bool site_backward = (primary_path.get_index().by_id.at(site->start().node_id()).first > - primary_path.get_index().by_id.at(site->end().node_id()).first); - - // Unpack the genotype back into best and second-best allele - auto& genotype = locus.genotype(0); - int best_allele = genotype.allele(0); - // If we called a single allele, we've lost the second-best allele info. But we won't need it, so we can just say -1. - int second_best_allele = (genotype.allele_size() >= 2 && genotype.allele(0) != genotype.allele(1)) ? - genotype.allele(1) : - -1; - - // Populate this with original node IDs, from before augmentation. - set original_nodes; - - // Calculate the ID and sequence strings for all the alleles. - // TODO: we only use some of these - vector sequences; - vector id_lists; - // Also the flags for whether alts are reference (i.e.
known) - vector is_ref; - - for (size_t i = 0; i < locus.allele_size(); i++) { - - // For each allele path in the Locus - auto& path = locus.allele(i); -#ifdef debug - cerr << "Extracting allele " << i << ": " << pb2json(path) << endl; -#endif - // Make a stream for the sequence of the path - stringstream sequence_stream; - // And for the description of involved IDs - stringstream id_stream; - - for (size_t j = 0; j < path.mapping_size(); j++) { - // For each mapping along the path - auto& mapping = path.mapping(j); - - // Record the sequence - string node_sequence = augmented.graph.get_node(mapping.position().node_id())->sequence(); - if (mapping.position().is_reverse()) { - node_sequence = reverse_complement(node_sequence); - } - sequence_stream << node_sequence; -#ifdef debug - cerr << "\tMapping: " << pb2json(mapping) << ", sequence " << node_sequence << endl; -#endif - if (j != 0) { - // Add a separator - id_stream << "_"; - } - // Record the ID - id_stream << mapping.position().node_id(); - - if (augmented.translator.has_translation(mapping.position(), false)) { - // This node is derived from an original graph node. Remember it. - original_nodes.insert(augmented.translator.translate(mapping.position()).node_id()); - } - } - - // Remember the descriptions of the alleles - if (site_backward) { - sequences.push_back(reverse_complement(sequence_stream.str())); - } else { - sequences.push_back(sequence_stream.str()); - } -#ifdef debug - cerr << "Recorded allele sequence " << sequences.back() << endl; -#endif - id_lists.push_back(id_stream.str()); - // And whether they're reference or not - if (augmented.has_base_graph()) { - is_ref.push_back(is_reference(path, augmented)); - } - } - - // Start off declaring the variable part to start at the start of - // the first anchoring node. We'll clip it back later to just what's - // after the shared prefix. - size_t variation_start = min(primary_path.get_index().by_id.at(site->start().node_id()).first, - primary_path.get_index().by_id.at(site->end().node_id()).first); - - // Keep track of the alleles that actually need to go in the VCF: - // ref, best, and second-best (if any), some of which may overlap. - // This is the order they will show up in the variant. - vector used_alleles; - used_alleles.push_back(0); - if (best_allele != 0) { - used_alleles.push_back(best_allele); - } - if(second_best_allele != -1 && second_best_allele != 0) { - used_alleles.push_back(second_best_allele); - } - - // Rewrite the sequences and variation_start to just represent the - // actually variable part, by dropping any common prefix and common - // suffix. We just do the whole thing in place, modifying the used - // entries in sequences. - - auto shared_prefix_length = [&](bool backward) { - size_t shortest_prefix = std::numeric_limits::max(); - - auto here = used_alleles.begin(); - if (here == used_alleles.end()) { - // No strings. - // Say no prefix is in common... - return (size_t) 0; - } - auto next = here; - next++; - - if (next == used_alleles.end()) { - // Only one string. - // Say no prefix is in common... - return (size_t) 0; - } - - while (next != used_alleles.end()) { - // Consider each allele and the next one after it, as - // long as we have both. 
- - // Figure out the shorter and the longer string - string* shorter = &sequences.at(*here); - string* longer = &sequences.at(*next); - if (shorter->size() > longer->size()) { - swap(shorter, longer); - } - - // Calculate the match length for this pair - size_t match_length; - if (backward) { - // Find out how far in from the right the first mismatch is. - auto mismatch_places = std::mismatch(shorter->rbegin(), shorter->rend(), longer->rbegin()); - match_length = std::distance(shorter->rbegin(), mismatch_places.first); - } else { - // Find out how far in from the left the first mismatch is. - auto mismatch_places = std::mismatch(shorter->begin(), shorter->end(), longer->begin()); - match_length = std::distance(shorter->begin(), mismatch_places.first); - } - - // The shared prefix of these strings limits the longest - // prefix shared by all strings. - shortest_prefix = min(shortest_prefix, match_length); - - here = next; - ++next; - } - - // Return the shortest universally shared prefix - return shortest_prefix; - }; - if (!leave_shared_ends) { - // Find and trim off the shared suffix - size_t shared_suffix = shared_prefix_length(true); - for (auto allele : used_alleles) { - sequences[allele] = sequences[allele].substr(0, sequences[allele].size() - shared_suffix); - } - - // Then trim off the shared prefix - size_t shared_prefix = shared_prefix_length(false); - for (auto allele : used_alleles) { - sequences[allele] = sequences[allele].substr(shared_prefix); - } - // Add it onto the start coordinate - // Todo: are we absolutely sure that we're advancing along the reference in both paths? - variation_start += shared_prefix; - } - - - // Make a Variant - vcflib::Variant variant; - variant.sequenceName = contig_names_by_path_name.at(primary_path.get_name()); - variant.setVariantCallFile(vcf); - variant.quality = 0; - // Position should be 1-based and offset with our offset option. - variant.position = variation_start + 1 + variant_offset; - - // Set the ID based on the IDs of the involved nodes. Note that the best - // allele may have no nodes (because it's a pure edge) - variant.id = id_lists.at(best_allele); - if(second_best_allele != -1 && !id_lists.at(second_best_allele).empty()) { - // Add the second best allele's nodes in. - variant.id += "-" + id_lists.at(second_best_allele); - } - - - if(sequences.at(0).empty() || - (best_allele != -1 && sequences.at(best_allele).empty()) || - (second_best_allele != -1 && sequences.at(second_best_allele).empty())) { - - // Fix up the case where we have an empty allele. - - // We need to grab the character before the variable part of the - // site in the reference. 
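
// [Editor's sketch] What follows is the standard VCF left-anchoring
// convention: when any allele would be empty (a pure insertion or deletion),
// every allele gets the reference base preceding the variable region
// prepended, and the 1-based variant position moves one base left. A minimal
// stand-alone version (hypothetical helper, not part of this diff):
#include <string>
#include <vector>

void left_anchor(std::vector<std::string>& alleles, char preceding_ref_base,
                 long& position_1based) {
    for (auto& allele : alleles) {
        allele.insert(allele.begin(), preceding_ref_base);  // pad every allele
    }
    --position_1based;  // budge the variant left to cover the padding base
}
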
- assert(variation_start > 0); - string extra_base = char_to_string(primary_path.get_index().sequence.at(variation_start - 1)); - - for(auto& seq : sequences) { - // Stick it on the front of all the allele sequences - seq = extra_base + seq; - } - - // Budge the variant left - variant.position--; - } - - // Make sure the ref allele is correct - { - string real_ref = primary_path.get_index().sequence.substr( - variant.position - variant_offset - 1, sequences.front().size()); - string got_ref = sequences.front(); - - if (real_ref != got_ref) { - cerr << "Error: Ref should be " << real_ref << " but is " << got_ref << " at " << variant.position << endl; - throw runtime_error("Reference mismatch at site " + pb2json(*site)); - } - - } - - // Add the ref allele to the variant - create_ref_allele(variant, sequences.front()); - - // Add the best allele - assert(best_allele != -1); - int best_alt = add_alt_allele(variant, sequences.at(best_allele)); - - int second_best_alt = (second_best_allele == -1) ? -1 : add_alt_allele(variant, sequences.at(second_best_allele)); - - // Say we're going to spit out the genotype for this sample. - variant.format.push_back("GT"); - auto& genotype_vector = variant.samples[sample_name]["GT"]; - - if (locus.genotype_size() > 0) { - // We actually made a call. Emit the first genotype, which is the call. - - // We need to rewrite the allele numbers to alt numbers, since - // we aren't keeping all the alleles in the VCF, so we can't use - // the natural conversion of Genotype to VCF genotype string. - - // Emit parts into this stream - stringstream stream; - for (size_t i = 0; i < genotype.allele_size(); i++) { - // For each allele called as present in the genotype - - // Convert from allele number to alt number - if (genotype.allele(i) == best_allele) { - stream << best_alt; - } else if (genotype.allele(i) == second_best_allele) { - stream << second_best_alt; - } else { - throw runtime_error("Allele " + to_string(genotype.allele(i)) + - " is not best or second-best and has no alt"); - } - - if (i + 1 != genotype.allele_size()) { - // Write a separator after all but the last one - stream << (genotype.is_phased() ? '|' : '/'); - } - } - // Save the finished genotype - genotype_vector.push_back(stream.str()); - } else { - // Say there's no call here - genotype_vector.push_back("./."); - } - - // Now fill in all the other variant info/format stuff - if(augmented.has_base_graph() && - ((best_allele != 0 && is_ref.at(best_allele)) || - (second_best_allele != 0 && second_best_allele != -1 && is_ref.at(second_best_allele)))) { - // Flag the variant as reference if either of its two best alleles - // is known but not the primary path. Don't put in a false entry if - // it isn't known, because vcflib will spit out the flag anyway... 
- variant.infoFlags["XREF"] = true; - } - - for (auto id : original_nodes) { - // Add references to the relevant original nodes - variant.info["XSEE"].push_back(to_string(id)); - } - - - // Now fill in all the other variant info/format stuff and emit it - add_variant_info_and_emit(variant, augmented, locus, genotype, best_allele, second_best_allele, used_alleles, - baseline_support, global_baseline_support); -} - -void SupportCaller::emit_recall_variant(map& contig_names_by_path_name, - vcflib::VariantCallFile& vcf, - SupportAugmentedGraph& augmented, - Support& baseline_support, - Support& global_baseline_support, - const Locus& locus, PrimaryPath& primary_path, const Snarl* site, - const vcflib::Variant* recall_variant) { - - vcflib::Variant variant; - variant.setVariantCallFile(vcf); - variant.sequenceName = recall_variant->sequenceName; - variant.position = recall_variant->position; - variant.id = recall_variant->id; - variant.ref = recall_variant->ref; - variant.alt = recall_variant->alt; - variant.alleles = recall_variant->alleles; - variant.quality = 0; - variant.updateAlleleIndexes(); - - // Say we're going to spit out the genotype for this sample. - variant.format.push_back("GT"); - auto& genotype_vector = variant.samples[sample_name]["GT"]; - - const Genotype& genotype = locus.genotype(0); - int best_allele = genotype.allele_size() > 0 ? genotype.allele(0) : -1; - int second_best_allele = genotype.allele_size() > 1 ? genotype.allele(1) : -1; - - // We "use" every allele in the variant, because we're emitting the original variant. - vector used_alleles; - for (int i = 0; i < variant.alleles.size(); ++i) { - used_alleles.push_back(i); - } - - if (best_allele >= 0) { - // We actually made a call. Emit the first genotype, which is the call. - - // We need to rewrite the allele numbers to alt numbers, since - // we aren't keeping all the alleles in the VCF, so we can't use - // the natural conversion of Genotype to VCF genotype string. - - // Emit parts into this stream - stringstream stream; - for (size_t i = 0; i < genotype.allele_size(); i++) { - stream << genotype.allele(i); - - if (i + 1 != genotype.allele_size()) { - // Write a separator after all but the last one - stream << (genotype.is_phased() ? 
'|' : '/'); - } - } - // Save the finished genotype - genotype_vector.push_back(stream.str()); - } else { - // Say there's no call here - genotype_vector.push_back("./."); - } - -#ifdef debug - cerr << "Recalling variant at " << variant.sequenceName << ":" << variant.position - << " and assigning GT " << genotype_vector.back() << endl; -#endif - - - add_variant_info_and_emit(variant, augmented, locus, genotype, best_allele, second_best_allele, - used_alleles, baseline_support, global_baseline_support); -} - - -void SupportCaller::add_variant_info_and_emit(vcflib::Variant& variant, SupportAugmentedGraph& augmented, - const Locus& locus, const Genotype& genotype, - int best_allele, int second_best_allele, - const vector& used_alleles, - Support& baseline_support, Support& global_baseline_support) { - - // Set up the depth format field - variant.format.push_back("DP"); - // And expected depth - variant.format.push_back("XDP"); - // And allelic depth - variant.format.push_back("AD"); - // And the log likelihood from the assignment of reads among the - // present alleles - variant.format.push_back("XADL"); - // And strand bias - variant.format.push_back("SB"); - // Also the alt allele depth - variant.format.push_back("XAAD"); - - // Compute the total support for all the alts that will be appearing - Support total_support; - // And total alt allele depth for the alt alleles - Support alt_support; - - if (best_allele >= 0) { //only add info if we made a call - for (int allele : used_alleles) { - // For all the alleles we are using, look at the support. - auto& support = locus.support(allele); - - // Set up allele-specific stats for the allele - variant.samples[sample_name]["AD"].push_back(to_string((int64_t)round(total(support)))); - variant.samples[sample_name]["SB"].push_back(to_string((int64_t)round(support.forward()))); - variant.samples[sample_name]["SB"].push_back(to_string((int64_t)round(support.reverse()))); - - // Sum up into total depth - total_support += support; - - if (allele != 0) { - // It's not the primary reference allele - alt_support += support; - } - } - } - - // Find the min total support of anything called - double min_site_support = genotype.allele_size() > 0 ? INFINITY : 0; - double min_site_quality = genotype.allele_size() > 0 ? INFINITY : 0; - - for (size_t i = 0; i < genotype.allele_size(); i++) { - // Min all the total supports from the non-ref alleles called as present - min_site_support = min(min_site_support, total(locus.support(genotype.allele(i)))); - min_site_quality = min(min_site_quality, locus.support(genotype.allele(i)).quality()); - } - - // Find the binomial bias between the called alleles, if multiple were called. - double ad_log_likelihood = INFINITY; - if (second_best_allele != -1) { - // How many of the less common one do we have? - size_t successes = round(total(locus.support(second_best_allele))); - // Out of how many chances - size_t trials = successes + (size_t) round(total(locus.support(best_allele))); - - assert(trials >= successes); - - // How weird is that? 
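
// [Editor's sketch] The call below asks how surprising it is to see at most
// `successes` reads on the minor allele out of `trials` total under a fair
// 50/50 split. For intuition only, a self-contained log-space binomial CMF;
// this is not vg's binomial_cmf_ln (which takes a log probability), and it
// takes a plain probability p instead:
#include <algorithm>
#include <cmath>
#include <cstddef>

double binomial_cmf_ln_sketch(double p, std::size_t trials, std::size_t successes) {
    double log_p = std::log(p), log_q = std::log1p(-p);
    double acc = -INFINITY;  // running ln of the partial sum
    for (std::size_t k = 0; k <= successes; ++k) {
        // ln C(trials, k) via lgamma, plus the Bernoulli terms
        double term = std::lgamma(trials + 1.0) - std::lgamma(k + 1.0)
                    - std::lgamma(trials - k + 1.0) + k * log_p
                    + (trials - k) * log_q;
        // log-sum-exp accumulation: acc = ln(exp(acc) + exp(term))
        double hi = std::max(acc, term), lo = std::min(acc, term);
        acc = (lo == -INFINITY) ? hi : hi + std::log1p(std::exp(lo - hi));
    }
    return acc;
}
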
- ad_log_likelihood = binomial_cmf_ln(prob_to_logprob((real_t) 0.5), trials, successes); - - assert(!std::isnan(ad_log_likelihood)); - - variant.samples[sample_name]["XADL"].push_back(to_string(ad_log_likelihood)); - } else { - // No need to assign reads between two alleles - variant.samples[sample_name]["XADL"].push_back("."); - } - - // Set the variant's total depth - string depth_string = to_string((int64_t)round(total(total_support))); - variant.info["DP"].push_back(depth_string); // We only have one sample, so variant depth = sample depth - - // And for the sample - variant.samples[sample_name]["DP"].push_back(depth_string); - - // Set the sample's local and global expected depth - variant.samples[sample_name]["XDP"].push_back(to_string((int64_t)round(total(baseline_support)))); - variant.samples[sample_name]["XDP"].push_back(to_string((int64_t)round(total(global_baseline_support)))); - - // And its depth of non-0 alleles - variant.samples[sample_name]["XAAD"].push_back(to_string((int64_t)round(total(alt_support)))); - - // Set the total support quality of the min allele as the variant quality - variant.quality = min_site_quality; - - // Now do the filters - variant.filter = "PASS"; - if (min_site_support < min_mad_for_filter) { - // Apply Min Allele Depth cutoff across all alleles (even ref) - variant.filter = "lowad"; - } else if (max_dp_for_filter != 0 && total(total_support) > max_dp_for_filter) { - // Apply the max depth cutoff - variant.filter = "highabsdp"; - } else if (max_dp_multiple_for_filter != 0 && - total(total_support) > max_dp_multiple_for_filter * total(global_baseline_support)) { - // Apply the max depth multiple cutoff - // TODO: Different standard for sites called as haploid - variant.filter = "highreldp"; - } else if (max_local_dp_multiple_for_filter != 0 && - total(total_support) > max_local_dp_multiple_for_filter * total(baseline_support)) { - // Apply the max local depth multiple cutoff - // TODO: Different standard for sites called as haploid - variant.filter = "highlocaldp"; - } else if (min_ad_log_likelihood_for_filter != 0 && - ad_log_likelihood < min_ad_log_likelihood_for_filter) { - // We have a het, but the assignment of reads between the two branches is just too weird - variant.filter = "lowxadl"; - } - - auto& genotype_vector = variant.samples[sample_name]["GT"]; - - // Don't bother with trivial calls - if (write_trivial_calls || !((string)recall_vcf_filename).empty() || - (genotype_vector.back() != "./." && genotype_vector.back() != ".|." && - genotype_vector.back() != "0/0" && genotype_vector.back() != "0|0" && - genotype_vector.back() != "." && genotype_vector.back() != "0")) { - - if(can_write_alleles(variant)) { - // No need to check for collisions because we assume sites are correctly found. - // Output the created VCF variant. -#pragma omp critical (cout) - cout << variant << endl; - - } else { - if (verbose) { - cerr << "Variant is too large" << endl; - } - // TODO: track bases lost again - } - } -} - -// this was main() in glenn2vcf -void SupportCaller::call( - // Augmented graph - SupportAugmentedGraph& augmented, - SnarlManager& site_manager, - // Should we load a pileup and print out pileup info as comments after - // variants? - string pileup_filename) { - - // Toggle support counter - support_val = use_support_count ? total : support_quality; - - // Set up the graph's paths properly after augmentation modified them.
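
// [Editor's sketch] The support_val assignment above is a strategy-selection
// idiom: choose the scoring function once, up front, instead of branching at
// every use site. In miniature (hypothetical MiniSupport; vg's Support is a
// Protobuf message):
#include <functional>

struct MiniSupport { double forward, reverse, quality; };

double count_score(const MiniSupport& s) { return s.forward + s.reverse; }
double quality_score(const MiniSupport& s) { return s.quality; }

std::function<double(const MiniSupport&)> pick_score(bool use_count) {
    return use_count ? count_score : quality_score;
}
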
- augmented.graph.paths.sort_by_mapping_rank(); - augmented.graph.paths.rebuild_mapping_aux(); - - // Make a list of the specified or autodetected primary reference paths. - vector primary_path_names = ref_path_names; - if (primary_path_names.empty()) { - // Try and guess reference path names for VCF conversion or coverage measurement. - if (verbose) { - std::cerr << "Graph has " << augmented.graph.paths.size() << " paths to choose from." - << endl; - } - if(augmented.graph.paths.size() == 1) { - // Autodetect the reference path name as the name of the only path - primary_path_names.push_back((*augmented.graph.paths._paths.begin()).first); - } else if (augmented.graph.paths.has_path("ref")) { - // Take any "ref" path. - primary_path_names.push_back("ref"); - } - - if (verbose && !primary_path_names.empty()) { - cerr << "Guessed reference path name of " << primary_path_names.front() << endl; - } - - } - - // We'll fill this in with a PrimaryPath for every primary reference path - // that is specified or detected. - primary_paths.clear(); - for (auto& name : primary_path_names) { - // Make a PrimaryPath for every primary path we have. - // Index the primary path and compute the binned supports. - primary_paths.emplace(std::piecewise_construct, - std::forward_as_tuple(name), - std::forward_as_tuple(augmented, name, ref_bin_size)); - - auto& primary_path = primary_paths.at(name); - - if (verbose) { - cerr << "Primary path " << name << " average/off-path assumed coverage: " - << primary_path.get_average_support() << endl; - cerr << "Minimum binned average coverage: " << primary_path.get_bin(primary_path.get_min_bin()) << " (bin " - << (primary_path.get_min_bin() + 1) << " / " << primary_path.get_total_bins() << ")" << endl; - cerr << "Maximum binned average coverage: " << primary_path.get_bin(primary_path.get_max_bin()) << " (bin " - << (primary_path.get_max_bin() + 1) << " / " << primary_path.get_total_bins() << ")" << endl; - } - } - - // If applicable, load the pileup. - // This will hold pileup records by node ID. - map node_pileups; - - function handle_pileup = [&](Pileup& p) { - // Handle each pileup chunk - for(size_t i = 0; i < p.node_pileups_size(); i++) { - // Pull out every node pileup - auto& pileup = p.node_pileups(i); - // Save the pileup under its node's pointer. - node_pileups[pileup.node_id()] = pileup; - } - }; - if(!pileup_filename.empty()) { - // We have to load some pileups - ifstream in; - in.open(pileup_filename.c_str()); - vg::io::for_each(in, handle_pileup); - } - - // Make a VCF because we need it in scope later, if we are outputting VCF. - vcflib::VariantCallFile vcf; - - // We also might need to fill in this contig names by path name map - map contig_names_by_path_name; - - if (convert_to_vcf) { - // Do initial setup for VCF output - - // Decide on names and lengths for all the primary paths.
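
// [Editor's sketch] The PrimaryPath objects built above bin coverage along
// each reference path in bins of width ref_bin_size; conceptually the lookup
// is integer division of a path offset by the bin width. Simplified sketch
// (uniform bins, plain doubles for support, at least one bin assumed; not
// vg's actual PrimaryPath):
#include <algorithm>
#include <cstddef>
#include <vector>

struct BinnedCoverage {
    std::size_t bin_size;
    std::vector<double> bins;  // average coverage per bin

    std::size_t bin_index(std::size_t path_offset) const {
        // Clamp so offsets in the final partial bin map to the last bin.
        return std::min(path_offset / bin_size, bins.size() - 1);
    }
    double support_at(std::size_t path_offset) const {
        return bins.at(bin_index(path_offset));
    }
};
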
- vector contig_lengths; - vector contig_names; - - for (size_t i = 0; i < primary_path_names.size(); i++) { - if (i < contig_name_overrides.size()) { - // Override this name - contig_names.push_back(contig_name_overrides.at(i)); - } else { - // Keep the path name from the graph - contig_names.push_back(primary_path_names.at(i)); - } - - // Allow looking up the assigned contig name later - contig_names_by_path_name[primary_path_names.at(i)] = contig_names.back(); - - if (i < length_overrides.size()) { - // Override this length - contig_lengths.push_back(length_overrides.at(i)); - } else { - // Grab the length from the index - contig_lengths.push_back(primary_paths.at(primary_path_names.at(i)).get_index().sequence.size()); - } - - // TODO: is this fall-through-style logic smart, or will we just - // neglect to warn people that they forgot options by parsing what - // they said when they provide too few overrides? - } - - // Generate a vcf header. We can't make Variant records without a - // VariantCallFile, because the variants need to know which of their - // available info fields or whatever are defined in the file's header, - // so they know what to output. - stringstream header_stream; - write_vcf_header(header_stream, {sample_name}, contig_names, contig_lengths, - min_mad_for_filter, max_dp_for_filter, max_dp_multiple_for_filter, max_local_dp_multiple_for_filter, - min_ad_log_likelihood_for_filter, augmented.has_base_graph()); - - // Load the headers into the VCF file object - string header_string = header_stream.str(); - assert(vcf.openForOutput(header_string)); - - // Spit out the header - cout << header_stream.str(); - } - - // Find all the top-level sites - list site_queue; - - site_manager.for_each_top_level_snarl([&](const Snarl* site) { - // Stick all the sites in this queue. - site_queue.emplace_back(site); - }); - - // We're going to run through all the top-level sites and keep just what we - // can use. If we're converting to VCF it's only stuff on a primary path, - // and we will break top-level sites to find things on a primary path. - // Otherwise it's everything. - vector sites; - - while(!site_queue.empty()) { - // Grab the first site - const Snarl* site = move(site_queue.front()); - site_queue.pop_front(); - - // If the site is strung on any of the primary paths, find the - // corresponding PrimaryPath object. Otherwise, leave this null. - PrimaryPath* primary_path = nullptr; - { - auto found = find_path(*site); - if (found != primary_paths.end()) { - primary_path = &found->second; - } - } - - // What has to be true about the site for us to find traversals of it? - // We can handle some things that aren't ultrabubbles, but we can't yet - // handle general snarls. - auto is_traversable = [](const Snarl* s) { - return s->start_end_reachable() && s->directed_acyclic_net_graph(); - }; - - // We don't just need to check the top snarl; we also need to enforce - // this on all child snarls, since when genotyping we need to recurse - // on the children to avoid being biased away from the paths they lie - // on when calling the parents. We can only skip parents and replace - // them with their children.
- function recursively_traversable = [&](const Snarl* s) -> bool { - if (!is_traversable(s)) { - return false; - } - - for (const Snarl* child : site_manager.children_of(s)) { - if (!recursively_traversable(child)) { - return false; - } - } - - return true; - }; - - bool site_traversable = recursively_traversable(site); - - - if (site_traversable && primary_path != nullptr) { - // This site is traversable through and is on a primary path - - // Throw it in the final vector of sites we're going to process. - sites.push_back(site); - } else if (site_traversable && !convert_to_vcf) { - // This site is traversable through and we can handle things off - // the primary path. - - // Throw it in the final vector of sites we're going to process. - sites.push_back(site); - - } else { - // The site is not on the primary path or isn't an ultrabubble, but - // maybe one of its children will meet our requirements. - - size_t child_count = site_manager.children_of(site).size(); - - for(const Snarl* child : site_manager.children_of(site)) { - // Dump all the children into the queue for separate - // processing. - site_queue.emplace_back(child); - } - - if (verbose) { - if (child_count) { - cerr << "Broke up off-reference site into " - << child_count << " children" << endl; - } else { - cerr << "Dropped off-reference site" << endl; - } - } - - } - } - - if (verbose) { - cerr << "Found " << sites.size() << " sites" << endl; - } - - auto get_path_index = [&](const Snarl& site) -> PathIndex* { - // When the TraversalFinder needs a primary path index for a site, it can look it up with this function. - auto found = find_path(site); - if (found != primary_paths.end()) { - // It's on a path - return &found->second.get_index(); - } else { - // It's not on a known primary path, so the TraversalFinder should make its own backbone path - return nullptr; - } - }; - - unique_ptr traversal_finder; - - if (!((string)recall_vcf_filename).empty()) { - - auto skip_alt = [&] (const SnarlTraversal& alt_path) -> bool { - Support avg_support; - size_t total_size; - tie(std::ignore, avg_support, total_size) = get_traversal_support( - augmented, site_manager, nullptr, alt_path, {}, nullptr); - return total_size > 0 && total(avg_support) / (double)total_size < min_alt_path_support; - }; - - // we are genotyping a VCF. load it and make sure we only traverse its alleles - vector ref_path_names; - for (const auto& primary_path : primary_paths) { - ref_path_names.push_back(primary_path.first); - } - traversal_finder = unique_ptr(new VCFTraversalFinder(augmented.graph, site_manager, - variant_file, ref_path_names, - ref_fasta.get(), - ins_fasta.get(), - skip_alt)); - (bool&)leave_shared_ends = true; - } - - else { - // Now start looking for traversals of the sites. 
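
// [Editor's sketch] The skip_alt pruning above keeps a candidate alt path
// only when its average per-base support clears min_alt_path_support.
// Distilled into a stand-alone predicate (plain doubles stand in for vg's
// Support type):
#include <cstddef>

bool skip_alt_sketch(double total_support, std::size_t total_size,
                     double min_alt_path_support) {
    return total_size > 0 &&
           total_support / static_cast<double>(total_size) < min_alt_path_support;
}
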
- function get_node_support = nullptr; - function get_edge_support = nullptr; - if (augmented.has_supports()) { - get_node_support = [&augmented](id_t node_id) { - return augmented.get_support(node_id); - }; - get_edge_support = [&augmented](edge_t edge) { - return augmented.get_support(edge); - }; - } - RepresentativeTraversalFinder* rep_trav_finder = new RepresentativeTraversalFinder(augmented.graph, site_manager, - max_search_depth, - max_search_width, - max_bubble_paths, - min_total_support_for_call, - min_total_support_for_call, - get_path_index, - get_node_support, - get_edge_support); - rep_trav_finder->other_orientation_timeout = max_inversion_size; - traversal_finder = unique_ptr(rep_trav_finder); - } - - // We're going to remember what nodes and edges are covered by sites, so we - // will know which nodes/edges aren't in any sites and may need generic - // presence/absence calls. - set covered_nodes; - set covered_edges; - - // When we genotype the sites into Locus objects, we will use this buffer for outputting them. - vector locus_buffer; - - // How many sites result in output? - size_t called_loci = 0; - - -#pragma omp parallel for - for(size_t site_number = 0; site_number < sites.size(); ++site_number) { - const Snarl* site = sites[site_number]; - // For every site, we're going to make a bunch of Locus objects - - // See if the site is on a primary path, so we can use binned support. - map::iterator found_path = find_path(*site); - - // We need to figure out how much support a site ought to have. - // Within its local bin? - Support baseline_support; - // On its primary path? - Support global_baseline_support; - if (expected_coverage != 0.0) { - // Use the specified coverage override - baseline_support.set_forward(expected_coverage / 2); - baseline_support.set_reverse(expected_coverage / 2); - global_baseline_support = baseline_support; - } else if (found_path != primary_paths.end()) { - // We're on a primary path, so we can find the appropriate bin - - // Since the variable part of the site is after the first anchoring node, where does it start? - // Account for the site possibly being backward on the path. - size_t variation_start = min(found_path->second.get_index().by_id.at(site->start().node_id()).first - + augmented.graph.get_node(site->start().node_id())->sequence().size(), - found_path->second.get_index().by_id.at(site->end().node_id()).first - + augmented.graph.get_node(site->end().node_id())->sequence().size()); - - // Look in the bins for the primary path to get the support there. - baseline_support = found_path->second.get_support_at(variation_start); - - // And grab the path's overall support - global_baseline_support = found_path->second.get_average_support(); - - } else { - // Just use the primary paths' average support, which may be 0 if there are none. - // How much support is expected across all the primary paths? May be 0 if there are no primary paths. - global_baseline_support = PrimaryPath::get_average_support(primary_paths); - baseline_support = global_baseline_support; - } - - // Recursively type the site, using that support and an assumption of a diploid sample. - find_best_traversals(augmented, site_manager, traversal_finder.get(), *site, baseline_support, 2, - [&](const Locus& locus, const Snarl* site, const vcflib::Variant* recall_variant = nullptr) { - - // Now we have the Locus with call information, and the site (either - // the root snarl we passed in or a child snarl) that the call is - // for. We need to output the call. 
- if (convert_to_vcf) { - // We want to emit VCF - - // Look up the path this child site lives on. (TODO: just capture and use the path the parent lives on?) - auto found_path = find_path(*site); - if(found_path != primary_paths.end()) { - // And this site is on a primary path - - // Emit the variant for this Locus - if (recall_variant != nullptr) { - emit_recall_variant(contig_names_by_path_name, vcf, augmented, baseline_support, - global_baseline_support, locus, found_path->second, site, recall_variant); - } else { - emit_variant(contig_names_by_path_name, vcf, augmented, baseline_support, - global_baseline_support, locus, found_path->second, site); - } - } - // Otherwise discard it as off-path - // TODO: update bases lost - } else { -#pragma omp critical (cout) - { - // Emit the locus itself - locus_buffer.push_back(locus); - vg::io::write_buffered(cout, locus_buffer, locus_buffer_size); - } - } - - // We called a site -#pragma omp atomic - called_loci++; - - // Mark all the nodes and edges in the site as covered - if (!convert_to_vcf) { - auto contents = site_manager.deep_contents(site, augmented.graph, true); -#pragma omp critical (covered) - { - for (auto& node_id : contents.first) { - Node* node = augmented.graph.get_node(node_id); - covered_nodes.insert(node); - } - for (auto& edge_handle : contents.second) { - Edge* edge = augmented.graph.get_edge( - NodeTraversal(augmented.graph.get_node(augmented.graph.get_id(edge_handle.first)), - augmented.graph.get_is_reverse(edge_handle.first)), - NodeTraversal(augmented.graph.get_node(augmented.graph.get_id(edge_handle.second)), - augmented.graph.get_is_reverse(edge_handle.second))); - covered_edges.insert(edge); - } - } - } - }); - } - - if (verbose) { - cerr << "Called " << called_loci << " loci" << endl; - } - - // OK now we have handled all the real sites. But there are still nodes and - // edges that we might want to call as present or absent. - - - if (!convert_to_vcf) { - - size_t extra_loci = 0; - - if (call_other_by_coverage) { - // We should look at the coverage of things off the primary path and - // make calls on them. 
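
// [Editor's sketch] The edge-coverage genotyping below boils down to a
// two-threshold rule against the average primary-path support: above 75% of
// it, call two copies; above 25%, call one; otherwise zero. As a stand-alone
// function (hypothetical helper mirroring the thresholds used below):

int copies_from_coverage(double support, double average_support) {
    if (support > 0.75 * average_support) return 2;  // closer to 2 copies
    if (support > 0.25 * average_support) return 1;  // closer to 1 copy
    return 0;                                        // closer to 0 copies
}
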
- - augmented.graph.for_each_edge([&](Edge* e) { - // We want to make calls on all the edges that aren't covered yet - if (covered_edges.count(e)) { - // Skip this edge - return; - } - - // Make a couple of fake Visits - Visit from_visit; - from_visit.set_node_id(e->from()); - from_visit.set_backward(e->from_start()); - Visit to_visit; - to_visit.set_node_id(e->to()); - to_visit.set_backward(e->to_end()); - - // Make a Locus for the edge - Locus locus; - - // Give it an allele - Path* path = locus.add_allele(); - - // Fill in - *path->add_mapping() = to_mapping(from_visit, augmented.graph); - *path->add_mapping() = to_mapping(to_visit, augmented.graph); - - // Set the support - edge_t edge_handle = augmented.graph.edge_handle(augmented.graph.get_handle(e->from(), e->from_start()), - augmented.graph.get_handle(e->to(), e->to_end())); - *locus.add_support() = augmented.get_support(edge_handle); - *locus.mutable_overall_support() = augmented.get_support(edge_handle); - - // Decide on the genotype - Genotype gt; - - // TODO: use the coverage bins - if (support_val(locus.support(0)) > support_val(PrimaryPath::get_average_support(primary_paths)) * 0.25) { - // We're closer to 1 copy than 0 copies - gt.add_allele(0); - - if (support_val(locus.support(0)) > support_val(PrimaryPath::get_average_support(primary_paths)) * 0.75) { - // We're closer to 2 copies than 1 copy - gt.add_allele(0); - } - } - // Save the genotype with 0, 1, or 2 copies. - *locus.add_genotype() = gt; - - // Send out the locus - locus_buffer.push_back(locus); - vg::io::write_buffered(cout, locus_buffer, locus_buffer_size); - - extra_loci++; - - // TODO: look at average node coverages and do node loci (in - // case any nodes have no edges?) - - }); - } else { - // We should just assert the existence of the primary path edges - // that weren't in something. Everything not asserted will get - // subsetted out. - - for (auto& kv : primary_paths) { - // For every primary path in the graph - auto& primary_path = kv.second; - - // Remember the end of the previous node - NodeSide previous_end; - - for (auto& offset_and_side : primary_path.get_index()) { - // For every NodeSide that happens in the primary path - - // Grab the side - NodeSide here = offset_and_side.second; - - if (previous_end == NodeSide()) { - // Skip the first node and remember its end - previous_end = here.flip(); - continue; - } - - // Find the edge we crossed - Edge* crossed = augmented.graph.get_edge(previous_end, here); - assert(crossed != nullptr); - - if (covered_edges.count(crossed)) { - // If the edge we crossed is covered by a snarl, don't - // emit anything. - previous_end = here.flip(); - continue; - } - - // If the edge we're crossing isn't covered, we should - // assert the primary path here.
- - // Make a couple of fake Visits - Visit from_visit; - from_visit.set_node_id(crossed->from()); - from_visit.set_backward(crossed->from_start()); - Visit to_visit; - to_visit.set_node_id(crossed->to()); - to_visit.set_backward(crossed->to_end()); - - // Make a Locus for the edge - Locus locus; - - // Give it an allele - Path* path = locus.add_allele(); - - // Fill in - *path->add_mapping() = to_mapping(from_visit, augmented.graph); - *path->add_mapping() = to_mapping(to_visit, augmented.graph); - - edge_t crossed_handle = augmented.graph.edge_handle(augmented.graph.get_handle(crossed->from(), crossed->from_start()), - augmented.graph.get_handle(crossed->to(), crossed->to_end())); - // Set the support - *locus.add_support() = augmented.get_support(crossed_handle); - *locus.mutable_overall_support() = augmented.get_support(crossed_handle); - - // Decide on the genotype of hom ref. - Genotype* gt = locus.add_genotype(); - gt->add_allele(0); - gt->add_allele(0); - - // Send out the locus - locus_buffer.push_back(locus); - vg::io::write_buffered(cout, locus_buffer, locus_buffer_size); - - extra_loci++; - - // Make sure to remember the end of the node we just did, - // for looking at the next node. - previous_end = here.flip(); - - } - } - } - - // Flush the buffer of Locus objects we have to write - vg::io::write_buffered(cout, locus_buffer, 0); - - if (verbose) { - cerr << "Called " << extra_loci << " extra loci with copy number estimates" << endl; - } - - } - -} - -bool SupportCaller::is_reference(const SnarlTraversal& trav, AugmentedGraph& augmented) { - - // Keep track of the previous NodeSide - NodeSide previous; - - // We'll call this function with each visit in turn. - // If it ever returns false, the whole thing is nonreference. - auto experience_visit = [&](const Visit& visit) { - // TODO: handle nested sites - assert(visit.node_id()); - - if (previous.node != 0) { - // Consider the edge from the previous visit - Edge* edge = augmented.graph.get_edge(previous, to_left_side(visit)); - - if (augmented.is_novel_edge(edge)) { - // Found a novel edge! - return false; - } - } - - if (augmented.is_novel_node(augmented.graph.get_node(visit.node_id()))) { - // This node itself is novel - return false; - } - - // Remember we want an edge from this visit when we look at the next - // one. - previous = to_right_side(visit); - - // This visit is known. - return true; - }; - - // Experience the entire traversal from start to end - for (size_t i = 0; i < trav.visit_size(); i++) { - if (!experience_visit(trav.visit(i))) { - return false; - } - } - - // And if we make it through it's a reference traversal. - return true; - -} - -bool SupportCaller::is_reference(const Path& path, AugmentedGraph& augmented) { - - // The path can't be empty because it's not clear if an empty path should be - // reference or not. - assert(path.mapping_size() != 0); - - for (size_t i = 0; i < path.mapping_size(); i++) { - // Check each mapping - auto& mapping = path.mapping(i); - - if (augmented.is_novel_node(augmented.graph.get_node(mapping.position().node_id()))) { - // We use a novel node - return false; - } - - if (i + 1 < path.mapping_size()) { - // Also look at the next mapping - auto& next_mapping = path.mapping(i + 1); - - // And see about the edge to it - Edge* edge = augmented.graph.get_edge(to_right_side(to_visit(mapping)), to_left_side(to_visit(next_mapping))); - if (augmented.is_novel_edge(edge)) { - // We used a novel edge - return false; - } - } - } - - // If we get through everything it's reference. 
return true; -} - -} diff --git a/src/support_caller.hpp b/src/support_caller.hpp deleted file mode 100644 index a86d3021b1b..00000000000 --- a/src/support_caller.hpp +++ /dev/null @@ -1,452 +0,0 @@ -#ifndef VG_SUPPORT_CALLER_HPP_INCLUDED -#define VG_SUPPORT_CALLER_HPP_INCLUDED - -#include -#include -#include -#include -#include -#include -#include -#include -#include "vg.hpp" -#include "hash_map.hpp" -#include "utility.hpp" -#include "pileup.hpp" -#include "path_index.hpp" -#include "genotypekit.hpp" -#include "option.hpp" -#include "traversal_finder.hpp" - -namespace vg { - -using namespace std; - -/** - * SupportCaller: take an augmented graph from a Caller and produce actual calls in a - * VCF. - */ -class SupportCaller : public Configurable { - -public: - - /** - * Set up to call with default parameters. - */ - SupportCaller() = default; - - /** - * We use this to represent a contig in the primary path, with its index and coverage info. - */ - class PrimaryPath { - public: - /** - * Index the given path in the given augmented graph, and compute all - * the coverage bin information with the given bin size. - */ - PrimaryPath(SupportAugmentedGraph& augmented, const string& ref_path_name, size_t ref_bin_size); - - /** - * Get the support at the bin appropriate for the given primary path - * offset. - */ - const Support& get_support_at(size_t primary_path_offset) const; - - /** - * Get the index of the bin that the given path position falls in. - */ - size_t get_bin_index(size_t primary_path_offset) const; - - /** - * Get the bin with minimal coverage. - */ - size_t get_min_bin() const; - - /** - * Get the bin with maximal coverage. - */ - size_t get_max_bin() const; - - /** - * Get the support in the given bin. - */ - const Support& get_bin(size_t bin) const; - - /** - * Get the total number of bins that the path is divided into. - */ - size_t get_total_bins() const; - - /** - * Get the average support over the path. - */ - Support get_average_support() const; - - /** - * Get the average support over a collection of paths. - */ - static Support get_average_support(const map& paths); - - /** - * Get the total support for the path. - */ - Support get_total_support() const; - - /** - * Get the PathIndex for this primary path. - */ - PathIndex& get_index(); - - /** - * Get the PathIndex for this primary path. - */ - const PathIndex& get_index() const; - - /** - * Gets the path name we are representing. - */ - const string& get_name() const; - - protected: - /// How wide is each coverage bin along the path? - size_t ref_bin_size; - - /// This holds the index for this path - PathIndex index; - - /// This holds the name of the path - string name; - - /// What's the expected coverage in each bin along the path? Coverage gets split - /// evenly over both strands. - vector binned_support; - - /// Which bin has min support? - size_t min_bin; - /// Which bin has max support? - size_t max_bin; - - /// What's the total Support over every bin? - Support total_support; - }; - - // We'll fill this in with a PrimaryPath for every primary reference path - // that is specified or detected. - map primary_paths; - - /** - * Produce calls for the given annotated augmented graph. If a - * pileup_filename is provided, the pileup is loaded again and used to add - * comments describing variants. - */ - void call(SupportAugmentedGraph& augmented, SnarlManager& site_manager, string pileup_filename = ""); - - /** - * Get the support and size for each traversal in a list.
Discount support - * of minus_traversal if it's specified. Use average_support_switch_threshold and - * use_average_support to decide whether to return min or avg supports. - */ - tuple, vector > get_traversal_supports_and_sizes( - SupportAugmentedGraph& augmented, SnarlManager& snarl_manager, const Snarl& site, - const vector& traversals, - const vector& minus_traversals = {}); - - /** - * Get the min support, total support, bp size (to divide total by for average - * support), optionally special-casing the material used by another traversal. - * Material used by another traversal only makes half its coverage available to this traversal. - * If ref_traversal is specified (and not the same as traversal), its contents will be forbidden from - * boosting the average support, to keep the actual variation from getting - * diluted by shared reference. - */ - tuple get_traversal_support( - SupportAugmentedGraph& augmented, SnarlManager& snarl_manager, const Snarl* site, - const SnarlTraversal& traversal, const vector& already_used = {}, - const SnarlTraversal* ref_traversal = nullptr); - - /** - * Get the edge supports of an inversion. This is to be used for computing genotypes with average support, - * as the normal supports will almost always return 0/1 due to the nodes having the same support. If - * the alleles don't refer to a simple inversion, an empty vector is returned (and should be ignored) - */ - vector get_inversion_supports( - SupportAugmentedGraph& augmented, SnarlManager& snarl_manager, const Snarl& site, - const vector& traversals, const vector& traversal_sizes, - int best_allele, int second_best_allele); - - /** - * For the given snarl, find the reference traversal, the best traversal, - * and the second-best traversal, recursively, if any exist. These - * traversals will be fully filled in with nodes. - * - * Only snarls which are ultrabubbles can be called. - * - * Expects the given baseline support for a diploid call. - * - * Will not return more than 1 + copy_budget SnarlTraversals, and will - * return fewer if some copies are called as having the same traversal. - * - * Does not deduplicate against the ref traversal; it may be the same as the - * best or second-best. - * - * Uses the given copy number allowance, and emits a Locus for this Snarl - * and any child Snarls. - * - * If no path through the Snarl can be found, emits no Locus and returns no - * SnarlTraversals. - * - */ - vector find_best_traversals(SupportAugmentedGraph& augmented, - SnarlManager& snarl_manager, TraversalFinder* finder, const Snarl& site, - const Support& baseline_support, size_t copy_budget, - function emit_locus); - - /** - * Instead of just emitting the locus, use the site info from the VCF - * to output a locus for each variant in the VCF that touches our site. - */ - void recall_locus(Locus& locus, const Snarl& site, vector& traversals, - vector>& trav_alleles, - vector& site_variants, - function emit_locus); - - /** This function emits the given variant on the given primary path, as - * VCF. It needs to take the site as an argument because it may be - * called for children of the site we're working on right now.
*/ - void emit_variant(map& contig_names_by_path_name, - vcflib::VariantCallFile& vcf, - SupportAugmentedGraph& augmented, - Support& baseline_support, - Support& global_baseline_support, - const Locus& locus, PrimaryPath& primary_path, const Snarl* site); - - /** Like emit_variant, but use the given vcf variant as a template and just - * compute the genotype and info */ - void emit_recall_variant(map& contig_names_by_path_name, - vcflib::VariantCallFile& vcf, - SupportAugmentedGraph& augmented, - Support& baseline_support, - Support& global_baseline_support, - const Locus& locus, PrimaryPath& primary_path, const Snarl* site, - const vcflib::Variant* recall_variant); - - /** add the info fields to a variant and actually emit it - * (used by both emit_variant and emit_recall_variant) */ - void add_variant_info_and_emit(vcflib::Variant& variant, SupportAugmentedGraph& augmented, - const Locus& locus, const Genotype& genotype, - int best_allele, int second_best_allele, - const vector& used_alleles, - Support& baseline_support, Support& global_baseline_support); - - /** - * Decide if the given SnarlTraversal is included in the original base graph - * (true), or if it represents a novel variant (false). - * - * Looks at the nodes in the traversal, and sees if their calls are - * CALL_REFERENCE or not. - * - * Handles single-edge traversals. - * - */ - bool is_reference(const SnarlTraversal& trav, AugmentedGraph& augmented); - - /** - * Decide if the given Path is included in the original base graph (true) or - * if it represents a novel variant (false). - * - * Looks at the nodes, and sees if their calls are CALL_REFERENCE or not. - * - * The path can't be empty; it has to be anchored to something (probably the - * start and end of the snarl it came from). - */ - bool is_reference(const Path& path, AugmentedGraph& augmented); - - /** - * Find the primary path, if any, that the given site is threaded onto. - * - * TODO: can only work by brute-force search. - */ - map::iterator find_path(const Snarl& site); - - /** - * Find the length of a deletion edge along (the first found) primary path. If no path is found - * or it is not a deletion edge, return 0. - */ - size_t get_deletion_length(const NodeSide& end1, const NodeSide& end2, SupportAugmentedGraph& augmented); - - /** - * Get the amount of support. Can use this function to toggle between unweighted (total from genotypekit) - * and quality-weighted (support_quality below) in one place. - */ - function support_val; - - static double support_quality(const Support& support) { - return support.quality(); - } - - // Option variables - - /// Should we output in VCF (true) or Protobuf Locus (false) format? - Option convert_to_vcf{this, "no-vcf", "V", true, - "output variants in binary Loci format instead of text VCF format"}; - /// How big should our output buffer be? - size_t locus_buffer_size = 1000; - - /// What are the names of the reference paths, if any, in the graph? - Option> ref_path_names{this, "ref", "r", {}, - "use the path with the given name as a reference path (can repeat)"}; - /// What name should we give each contig in the VCF file? Autodetected from - /// path names if empty or too short. - Option> contig_name_overrides{this, "contig", "c", {}, - "use the given name as the VCF name for the corresponding reference path (can repeat)"}; - /// What should the total sequence length reported in the VCF header be for - /// each contig? Autodetected from path lengths if empty or too short.
- Option> length_overrides{this, "length", "l", {}, - "override total sequence length in VCF for the corresponding reference path (can repeat)"}; - /// What name should we use for the sample in the VCF file? - Option sample_name{this, "sample", "S", "SAMPLE", - "name the sample in the VCF with the given name"}; - /// How far should we offset positions of variants? - Option variant_offset{this, "offset", "o", 0, - "offset variant positions by this amount in VCF"}; - /// How many nodes should we be willing to look at on our path back to the - /// primary path? Keep in mind we need to look at all valid paths (and all - /// combinations thereof) until we find a valid pair. - Option max_search_depth{this, "max-search-depth", "D", 1000, - "maximum depth for path search"}; - /// How many search states should we allow on the DFS stack when searching - /// for traversals? - Option max_search_width{this, "max-search-width", "wWmMsS", 1000, - "maximum width for path search"}; - - - /// What fraction of average coverage should be the minimum to call a - /// variant (or a single copy)? Default to 0 because vg call is still - /// applying depth thresholding - Option min_fraction_for_call{this, "min-cov-frac", "F", 0, - "min fraction of average coverage at which to call"}; - /// What fraction of the reads supporting an alt are we willing to discount? - /// At 2, if twice the reads support one allele as the other, we'll call - /// homozygous instead of heterozygous. At infinity, every call will be - /// heterozygous if even one read supports each allele. - Option max_het_bias{this, "max-het-bias", "H", 10, - "max imbalance factor to call heterozygous, alt major on SNPs"}; - /// Like above, but applied to ref / alt ratio (instead of alt / ref) - Option max_ref_het_bias{this, "max-ref-bias", "R", 4.5, - "max imbalance factor to call heterozygous, ref major"}; - /// Like the max het bias, but applies to novel indels. - Option max_indel_het_bias{this, "max-indel-het-bias", "I", 3, - "max imbalance factor to call heterozygous, alt major on indels"}; - /// Like the max het bias, but applies to multiallelic indels. - Option max_indel_ma_bias{this, "max-indel-ma-bias", "G", 6, - "max imbalance factor between ref and alt2 to call 1/2 double alt on indels"}; - /// What's the minimum integer number of reads that must support a call? We - /// don't necessarily want to call a SNP as het because we have a single - // supporting read, even if there are only 10 reads on the site. - Option min_total_support_for_call{this, "min-count", "n", 1, - "min total supporting read count to call a variant"}; - /// Bin size used for counting coverage along the reference path. The - /// bin coverage is used for computing the probability of an allele - /// of a certain depth - Option ref_bin_size{this, "bin-size", "B", 250, - "bin size used for counting coverage"}; - /// On some graphs, we can't get the coverage because it's split over - /// parallel paths. Allow overriding here - Option expected_coverage{this, "avg-coverage", "C", 0.0, - "specify expected coverage (instead of computing on reference)"}; - /// Should we use average support instead of minimum support for our - /// calculations? 
- Option use_average_support{this, "use-avg-support", "u", false, - "use average instead of minimum support"}; - /// Max traversal length threshold at which we switch from minimum support - /// to average support (so we don't use average support on pairs of adjacent - /// errors and miscall them, but we do use it on long runs of reference - /// inside a deletion where the min support might not be representative). - Option average_support_switch_threshold{this, "use-avg-support-above", "uUaAtT", 100, - "use average instead of minimum support for sites this long or longer"}; - - /// What's the maximum number of bubble path combinations we can explore - /// while finding one with maximum support? - size_t max_bubble_paths = 100; - /// what's the minimum ref or alt allele depth to give a PASS in the filter - /// column? Also used as a min actual support for a second-best allele call - Option min_mad_for_filter{this, "min-mad", "E", 5, - "min. ref/alt allele depth to PASS filter or be a second-best allele"}; - /// what's the maximum total depth to give a PASS in the filter column - Option max_dp_for_filter{this, "max-dp", "MmDdAaXxPp", 0, - "max depth to PASS filter (0 for unlimited)"}; - /// what's the maximum total depth to give a PASS in the filter column, as a - /// multiple of the global baseline coverage? - Option max_dp_multiple_for_filter{this, "max-dp-multiple", "MmDdAaXxPp", 0, - "max portion of global expected depth to PASS filter (0 for unlimited)"}; - /// what's the maximum total depth to give a PASS in the filter column, as a - /// multiple of the local baseline coverage? - Option max_local_dp_multiple_for_filter{this, "max-local-dp-multiple", "MmLlOoDdAaXxPp", 0, - "max portion of local expected depth to PASS filter (0 for unlimited)"}; - /// what's the min log likelihood for allele depth assignments to PASS? - Option min_ad_log_likelihood_for_filter{this, "min-ad-log-likelihood", "MmAaDdLliI", -9.0, - "min log likelihood for AD assignments to PASS filter (0 for unlimited)"}; - - Option write_trivial_calls{this, "trival", "ivtTIRV", false, - "write trivial vcf calls (e.g. 0/0 genotypes)"}; - - /// Should we call on nodes/edges outside of snarls by coverage (true), or - /// just assert that primary path things exist and off-path things don't - /// (false)? - Option call_other_by_coverage{this, "call-nodes-by-coverage", "cCoObB", false, - "make calls on nodes/edges outside snarls by coverage"}; - - /// Use total support count (true) instead of total support quality (false) when choosing - /// top alleles and deciding genotypes based on the biases. - Option use_support_count{this, "use-support-count", "T", false, - "use total support count instead of total support quality for selecting top alleles"}; - - /// Path of supports file generated from the PileupAugmenter (via vg augment) - Option support_file_name{this, "support-file", "s", {}, - "path of file containing supports generated by vg augment -P -s"}; - - /// Don't collapse the shared prefix and suffix of the different alleles - /// in an output VCF line. This is mostly for debugging. - Option leave_shared_ends{this, "leave-shared-ends", "X", false, - "don't collapse shared prefix and suffix of alleles in VCF output"}; - - Option max_inversion_size{this, "max-inv", "e", 1000, - "maximum detectable inversion size in number of nodes"}; - - Option recall_vcf_filename{this, "recall-vcf", "f", "", - "VCF to genotype against.
-
-    Option<string> recall_ref_fasta_filename{this, "recall-fasta", "a", "",
-        "Reference FASTA required for --recall-vcf in the presence of symbolic deletions or inversions"};
-
-    Option<string> recall_ins_fasta_filename{this, "insertion-fasta", "Z", "",
-        "Insertion FASTA required for --recall-vcf in the presence of symbolic insertions"};
-
-    /// Path of pack file generated from vg pack
-    Option<string> pack_file_name{this, "pack-file", "P", {},
-        "path of pack file from vg pack"};
-
-    Option<string> xg_file_name{this, "xg-file", "x", {},
-        "path of xg file (required to read pack file with -P)"};
-
-    /// structures to hold the recall vcf and fastas
-    vcflib::VariantCallFile variant_file;
-    unique_ptr<FastaReference> ref_fasta;
-    unique_ptr<FastaReference> ins_fasta;
-    /// minimum average base support on alt path for it to be considered
-    double min_alt_path_support = 0.2;
-
-    /// print warnings etc. to stderr
-    bool verbose = false;
-
-    /// inversion or deletion edges greater than this length with 0 support
-    /// will clamp average support down to 0. this is primarily to prevent
-    /// FP inversions when using average support
-    int max_unsupported_edge_size = 20;
-
-};
-
-}
-
-#endif
diff --git a/src/vg.cpp b/src/vg.cpp
index f1b86ad0569..48275b18c35 100644
--- a/src/vg.cpp
+++ b/src/vg.cpp
@@ -5253,26 +5253,12 @@ void VG::edit(istream& paths_to_add,
     // If we are going to actually add the paths to the graph, we need to break at path ends
     break_at_ends |= save_paths;
 
-    function<void(Path&)> save_fn = nullptr;
-    if (save_paths) {
-        save_fn = [&](Path& added) {
-            paths.extend(added, false, false);
-        };
-    }
-
     // Rebuild path ranks, aux mapping, etc. by compacting the path ranks
     paths.compact_ranks();
 
     // Augment the graph with the paths, modifying paths in place if update true
-    augment(this, paths_to_add, out_translations, out_gam_stream, save_fn,
+    augment(this, paths_to_add, out_translations, out_gam_stream, save_paths,
            break_at_ends, remove_softclips);
-
-    // Rebuild path ranks, aux mapping, etc. by compacting the path ranks
-    // Todo: can we just do this once?
-    paths.compact_ranks();
-
-    // execute a semi partial order sort on the nodes
-    sort();
 }
 
 // The not quite as robust (TODO: how?) but actually efficient way to edit the graph.
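Note for reviewers (not part of the patch): with the save_path_fn callback gone, path embedding is selected by the new boolean and behaves the same on any MutablePathMutableHandleGraph backend, not just VG. A minimal sketch of calling the new overload on a bdsg::HashGraph follows; the include paths, file name, and flag choices are illustrative assumptions, not code from this change.

    // Sketch only: exercising the new augment() signature with embed_paths = true.
    #include "augment.hpp"            // assumed include path for vg::augment
    #include <bdsg/hash_graph.hpp>    // assumed include path for bdsg::HashGraph
    #include <fstream>
    #include <vector>

    void embed_reads_example(bdsg::HashGraph& graph) {
        std::ifstream gam_stream("reads.gam");       // hypothetical input GAM
        std::vector<vg::Translation> translations;   // new-node-to-old-node mapping
        vg::augment(&graph,
                    gam_stream,
                    &translations,
                    nullptr,   // no GAM output stream
                    true,      // embed_paths: alignments are added back as embedded paths
                    true,      // break_at_ends, as VG::edit forces whenever paths are saved
                    false);    // keep softclips
    }

Because embedding now goes through add_path_to_graph() inside augment_impl, the VG-specific compact_ranks()/sort() fixup moves there too and other backends skip it entirely.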
diff --git a/test/pileup/2snp.gam b/test/pileup/2snp.gam
new file mode 100644
index 00000000000..23a45ae7994
Binary files /dev/null and b/test/pileup/2snp.gam differ
diff --git a/test/pileup/2snp.gam.vgpu.json b/test/pileup/2snp.gam.vgpu.json
new file mode 100644
index 00000000000..725684f74b9
--- /dev/null
+++ b/test/pileup/2snp.gam.vgpu.json
@@ -0,0 +1 @@
+{"node_pileups": [{"base_pileup": [{"ref_base": 67}, {"bases": ",,", "num_bases": 2, "ref_base": 65}, {"bases": ",,.", "num_bases": 3, "ref_base": 65}, {"bases": ",,..", "num_bases": 4, "ref_base": 65}, {"bases": ",,..", "num_bases": 4, "ref_base": 84}, {"bases": ",,....", "num_bases": 6, "ref_base": 65}, {"bases": ",,....", "num_bases": 6, "ref_base": 65}, {"bases": ",,....", "num_bases": 6, "ref_base": 71}, {"bases": ",,....", "num_bases": 6, "ref_base": 71}, {"bases": ",,....", "num_bases": 6, "ref_base": 67}, {"bases": ",,....", "num_bases": 6, "ref_base": 84}, {"bases": ",,,..,,..", "num_bases": 9, "ref_base": 84}, {"bases": ",,,..,,.,.", "num_bases": 10, "ref_base": 71}, {"bases": ",,,..,,.,.", "num_bases": 10, "ref_base": 71}, {"bases": ",,,.,.,,,.,..", "num_bases": 13, "ref_base": 65}, {"bases": ".,,,..,.,,,.,,,..,", "num_bases": 18, "ref_base": 65}, {"bases": ".,,,...,.,,,,.,,,..,", "num_bases": 20, "ref_base": 65}, {"bases": ".,,,,....,.,,,,.,,,..,", "num_bases": 22, "ref_base": 84}, {"bases": ".,,,,,....,.,,,,.,,,..,", "num_bases": 23, "ref_base": 84}, {"bases": ".,,,,,.,.,..,.,,,,.,,,..,.", "num_bases": 26, "ref_base": 84}, {"bases": ".,,,,,.,.,..,.,,,,.,,,,.,.,,..", "num_bases": 30, "ref_base": 84}, {"bases": ".,,,,,.,.,..,.,,,,.,,,,.,.,,..", "num_bases": 30, "ref_base": 67}, {"bases": ".,,,,,.,.,..,.,,,,.,,,,.,.,,..", "num_bases": 30, "ref_base": 84}, {"bases": ".,,,,,.,.,..,.,,,,.,,,,.,.,,..", "num_bases": 30, "ref_base": 71}, {"bases": ".,,,,,.,.,..,.,,,,.,,,,.,.,,..", "num_bases": 30, "ref_base": 71}, {"bases": ".,,,,,.,.,..,.,,,,.,,,,.,.,,..", "num_bases": 30, "ref_base": 65}, {"bases": ".,,,,,.,.,..,.,,,,.,,,,.,.,,..", "num_bases": 30, "ref_base": 71}, {"bases": "AtttttAtAtAAtAttttAttttAtAttAA", "num_bases": 30, "ref_base": 84}, {"bases": ".,,,,,.,.,..,.,,,,.,,,,.,.,,..", "num_bases": 30, "ref_base": 84}, {"bases": ".,,,,,.,.,..,.,,,,.,,,,.,.,,..", "num_bases": 30, "ref_base": 67}, {"bases": ".,,,,,.,.,..,.,,,,.,,,,.,.,,..", "num_bases": 30, "ref_base": 84}, {"bases": ".,,,.,.,..,.,,,,.,,,,.,.,,..", "num_bases": 28, "ref_base": 65}, {"bases": ".,,,.,.,..,.,,,,.,,,,.,,,..", "num_bases": 27, "ref_base": 84}, {"bases": ".,,,.,.,.,.,,,,.,,,,.,,,..", "num_bases": 26, "ref_base": 84}, {"bases": ".,,,.,.,.,.,,,,.,,,,.,,,..", "num_bases": 26, "ref_base": 65}, {"bases": ".,,,.,.,.,,,,,,,,,.,,,..", "num_bases": 24, "ref_base": 84}, {"bases": ".,,,.,.,.,,,,,,,,,.,,,..", "num_bases": 24, "ref_base": 65}, {"bases": "CgggCgCgCgggggggggCgggCC", "num_bases": 24, "ref_base": 84}, {"bases": ".,,,.,.,.,,,,,,,,,.,,,..", "num_bases": 24, "ref_base": 84}, {"bases": ".,,,.,.,.,,,,,,,,,.,,,..", "num_bases": 24, "ref_base": 67}, {"bases": ".,,,.,.,.,,,,,,,,,.,,,..", "num_bases": 24, "ref_base": 67}, {"bases": ".,,.,.,.,,,,,,,.,,,..", "num_bases": 21, "ref_base": 65}, {"bases": ".,,.,.,.,,,,,,.,,,..", "num_bases": 20, "ref_base": 65}, {"bases": ".,,.,.,.,,,,,,.,,,..", "num_bases": 20, "ref_base": 67}, {"bases": ".,,.,.,.,,,,,,,..", "num_bases": 17, "ref_base": 84}, {"bases": ",,.,.,,,,,..", "num_bases": 12, "ref_base": 67}, {"bases": ",,,.,,,,..", "num_bases": 10, "ref_base": 84}, {"bases": ",,,,,,..", "num_bases": 8, "ref_base": 67}, {"bases": ",,,,,..", "num_bases": 7, "ref_base": 84}, {"bases": ",,,.", "num_bases": 4, "ref_base": 71}], "node_id": "1"}]}
diff --git a/test/pileup/tiny.gam b/test/pileup/tiny.gam
new file mode 100644
index 00000000000..dbc890c9772
Binary files /dev/null and b/test/pileup/tiny.gam differ
diff --git a/test/pileup/tiny.vgpu.json b/test/pileup/tiny.vgpu.json
new file mode 100644
index 00000000000..dbb035d8f3b
--- /dev/null
+++ b/test/pileup/tiny.vgpu.json
@@ -0,0 +1,4 @@
+{"edge_pileups": [{"edge": {"from": "12", "to": "14"}, "num_forward_reads": 1, "num_reads": 4}, {"edge": {"from": "10", "to": "12"}, "num_forward_reads": 3, "num_reads": 5}, {"edge": {"from": "12", "to": "13"}, "num_forward_reads": 3, "num_reads": 5}, {"edge": {"from": "2", "to": "5"}, "num_forward_reads": 1, "num_reads": 2}, {"edge": {"from": "8", "to": "9"}, "num_forward_reads": 5, "num_reads": 8}], "node_pileups": [{"base_pileup": [{"bases": ".,,,.,", "num_bases": 6, "ref_base": 67}], "node_id": "5"}, {"base_pileup": [{"bases": "....,", "num_bases": 5, "ref_base": 65}], "node_id": "7"}, {"base_pileup": [{"bases": ",,..,.,,.", "num_bases": 9, "ref_base": 65}, {"bases": ",,..,,.,,.", "num_bases": 10, "ref_base": 84}, {"bases": ",,..,,,,,", "num_bases": 9, "ref_base": 65}, {"bases": ",,...,.,,,", "num_bases": 10, "ref_base": 84}], "node_id": "12"}, {"base_pileup": [{"bases": ",,....,,.", "num_bases": 9, "ref_base": 71}], "node_id": "8"}, {"base_pileup": [{"bases": ".", "num_bases": 1, "ref_base": 67}, {"bases": "...", "num_bases": 3, "ref_base": 65}, {"bases": "..,.", "num_bases": 4, "ref_base": 65}, {"bases": "..,.", "num_bases": 4, "ref_base": 65}, {"bases": ",..,.", "num_bases": 5, "ref_base": 84}, {"bases": ",..,,.", "num_bases": 6, "ref_base": 65}, {"bases": ",..,,,.", "num_bases": 7, "ref_base": 65}, {"bases": ",...,,,.,", "num_bases": 9, "ref_base": 71}], "node_id": "1"}]}
+{"edge_pileups": [{"edge": {"from": "7", "to": "9"}, "num_forward_reads": 4, "num_reads": 5}, {"edge": {"from": "5", "to": "6"}, "num_forward_reads": 1, "num_reads": 5}, {"edge": {"from": "9", "to": "10"}, "num_forward_reads": 3, "num_reads": 6}, {"edge": {"from": "4", "to": "6"}, "num_forward_reads": 4, "num_reads": 6}, {"edge": {"from": "11", "to": "12"}, "num_reads": 2}], "node_pileups": [{"base_pileup": [{"bases": "..,.,", "num_bases": 5, "ref_base": 65}], "node_id": "13"}, {"base_pileup": [{"bases": ".,...,", "num_bases": 6, "ref_base": 84}], "node_id": "4"}, {"base_pileup": [{"bases": ".,.,..,,..,.,", "num_bases": 13, "ref_base": 84}, {"bases": ".,,..,,..,,.", "num_bases": 12, "ref_base": 84}, {"bases": ".,,....,..,,.", "num_bases": 13, "ref_base": 71}], "node_id": "6"}, {"base_pileup": [{"bases": ",,", "num_bases": 2, "ref_base": 84}], "node_id": "11"}, {"base_pileup": [{"bases": ",..,,.", "num_bases": 6, "ref_base": 65}], "node_id": "10"}]}
+{"edge_pileups": [{"edge": {"from": "6", "to": "8"}, "num_forward_reads": 5, "num_reads": 9}, {"edge": {"from": "3", "to": "4"}, "num_forward_reads": 3, "num_reads": 4}, {"edge": {"from": "9", "to": "11"}, "num_reads": 2}, {"edge": {"from": "14", "to": "15"}, "num_forward_reads": 1, "num_reads": 3}, {"edge": {"from": "2", "to": "4"}, "num_forward_reads": 1, "num_reads": 2}], "node_pileups": [{"base_pileup": [{"bases": "..,.....,...,.,,.,", "num_bases": 18, "ref_base": 65}, {"bases": "..,........,.,,,.,", "num_bases": 18, "ref_base": 65}, {"bases": "..,,.........,.,,.,", "num_bases": 19, "ref_base": 65}, {"bases": "..,,........,.,,.,", "num_bases": 18, "ref_base": 84}, {"bases": "..,........,.,,.,", "num_bases": 17, "ref_base": 84}, {"bases": "..,.......,.,,.,", "num_bases": 16, "ref_base": 84}, {"bases": "..,......,.,,.,", "num_bases": 15, "ref_base": 84}, {"bases": "..,.......,.,,,", "num_bases": 15, "ref_base": 67}, {"bases": ".,..,.......,,.,,,", "num_bases": 18, "ref_base": 84}, {"bases": ".,..,......,,.,,,", "num_bases": 17, "ref_base": 71}, {"bases": ".,.,.....,,,,", "num_bases": 13, "ref_base": 71}, {"bases": ".,.,.....,,,", "num_bases": 12, "ref_base": 65}, {"bases": ".,......,,,.", "num_bases": 12, "ref_base": 71}, {"bases": ".,..,....,,.", "num_bases": 12, "ref_base": 84}, {"bases": ",..,....,,,.", "num_bases": 12, "ref_base": 84}, {"bases": ",,,...,....,,,.", "num_bases": 15, "ref_base": 67}, {"bases": ",,,...,...,,,.", "num_bases": 14, "ref_base": 84}, {"bases": ",,,..,...,,,.", "num_bases": 13, "ref_base": 65}, {"bases": ",,.,.,,.", "num_bases": 8, "ref_base": 84}], "node_id": "9"}, {"base_pileup": [{"bases": "..,.,,,", "num_bases": 7, "ref_base": 67}, {"bases": "..,.,,,", "num_bases": 7, "ref_base": 67}, {"bases": "..,.,,,", "num_bases": 7, "ref_base": 65}, {"bases": "..,.,,,", "num_bases": 7, "ref_base": 65}, {"bases": "..,.,,,", "num_bases": 7, "ref_base": 67}, {"bases": ".,.,,", "num_bases": 5, "ref_base": 84}, {"bases": "..,,", "num_bases": 4, "ref_base": 67}, {"bases": "..,", "num_bases": 3, "ref_base": 84}, {"bases": ",", "num_bases": 1, "ref_base": 67}, {"ref_base": 84}, {"ref_base": 71}], "node_id": "15"}, {"base_pileup": [{"bases": ".,.,", "num_bases": 4, "ref_base": 65}], "node_id": "2"}, {"base_pileup": [{"bases": ",,.,,", "num_bases": 5, "ref_base": 84}], "node_id": "14"}, {"base_pileup": [{"bases": ".,.,,.,", "num_bases": 7, "ref_base": 71}], "node_id": "3"}]}
+{"edge_pileups": [{"edge": {"from": "1", "to": "3"}, "num_forward_reads": 3, "num_reads": 6}, {"edge": {"from": "13", "to": "15"}, "num_forward_reads": 2, "num_reads": 4}, {"edge": {"from": "6", "to": "7"}, "num_forward_reads": 3, "num_reads": 4}, {"edge": {"from": "3", "to": "5"}, "num_reads": 3}, {"edge": {"from": "1", "to": "2"}, "num_forward_reads": 1, "num_reads": 3}]}
diff --git a/test/t/17_vg_augment.t b/test/t/17_vg_augment.t
index 190fae1ac6c..1b667dfb9b6 100644
--- a/test/t/17_vg_augment.t
+++ b/test/t/17_vg_augment.t
@@ -6,19 +6,10 @@ BASH_TAP_ROOT=../deps/bash-tap
 PATH=../bin:$PATH # for vg
 
-plan tests 13
+plan tests 14
 
 vg view -J -v pileup/tiny.json > tiny.vg
 
-# Compare output of pileup on tiny.vg and pileup/alignment.json
-# with pileup/truth.json, which has been manually vetted.
-# Will also test some vg view functionality.
-vg view -J -a -G pileup/alignment.json > alignment.gam
-vg augment -a pileup tiny.vg alignment.gam -P tiny.gpu > /dev/null
-vg view tiny.gpu -l -j | jq . > tiny.gpu.json
-is $(jq --argfile a tiny.gpu.json --argfile b pileup/truth.json -n '($a == $b)') true "vg augment -P produces the expected output for test case on tiny graph."
-rm -f alignment.gam tiny.gpu tiny.gpu.json
-
 
 # Make sure well-supported edits are augmented in
 vg view -J -a -G pileup/edits.json > edits.gam
 vg augment -a direct tiny.vg edits.gam -A edits-embedded.gam > augmented.vg
@@ -78,4 +69,15 @@ vg sim -l 30 -x 2snp.xg -n 30 -a >2snp.sim
 vg index -x flat.xg -g flat.gcsa -k 16 flat.vg
 vg map -g flat.gcsa -x flat.xg -G 2snp.sim -k 8 >2snp.gam
 is $(vg augment flat.vg 2snp.gam -i | vg mod -D - | vg mod -n - | vg view - | grep ^S | wc -l) 7 "editing the graph with many SNP-containing alignments does not introduce duplicate identical nodes"
-rm -f flat.vg flat.gcsa flat.xg 2snp.vg 2snp.xg 2snp.sim 2snp.gam
+
+vg augment flat.vg 2snp.gam | vg view - | grep S | awk '{print $3}' | sort > vg_augment.nodes
+vg convert flat.vg -p > flat.pg
+vg augment flat.pg 2snp.gam | vg convert -v - | vg view - | grep S | awk '{print $3}' | sort > packed_graph_augment.nodes
+diff vg_augment.nodes packed_graph_augment.nodes
+is "$?" 0 "augmenting a packed graph produces the same results as a vg graph"
+vg convert flat.vg -a > flat.hg
+vg augment flat.hg 2snp.gam | vg convert -v - | vg view - | grep S | awk '{print $3}' | sort > hash_graph_augment.nodes
+diff vg_augment.nodes hash_graph_augment.nodes
+is "$?" 0 "augmenting a hash graph produces the same results as a vg graph"
+
+rm -f flat.vg flat.gcsa flat.xg flat.pg flat.hg 2snp.vg 2snp.xg 2snp.sim 2snp.gam vg_augment.nodes packed_graph_augment.nodes hash_graph_augment.nodes
diff --git a/test/t/34_vg_pack.t b/test/t/34_vg_pack.t
index ff661fa12f6..869217c8330 100644
--- a/test/t/34_vg_pack.t
+++ b/test/t/34_vg_pack.t
@@ -16,9 +16,11 @@ vg map -g flat.gcsa -x flat.xg -G 2snp.sim -k 8 >2snp.gam
 vg pack -x flat.xg -o 2snp.gam.cx -g 2snp.gam -e
 is $(vg pack -x flat.xg -di 2snp.gam.cx -e | tail -n+2 | cut -f 5 | grep -v ^0$ | wc -l) 2 "allele observation packing detects 2 SNPs"
 
-vg augment -a pileup flat.vg 2snp.gam -P 2snp.gam.vgpu >/dev/null
-is $(vg view -l 2snp.gam.vgpu| jq '.node_pileups[].base_pileup[] | (.num_bases // 0)' | awk '{ print NR-1, $0 }' | head | md5sum | cut -f 1 -d\ )\
-   $(vg pack -x flat.xg -di 2snp.gam.cx -e | awk '{ print $3, $4 }' | tail -n+2 | head | md5sum | cut -f 1 -d\ ) "pileup packs agree with graph coverage"
+# compare against a snapshot of the old pileup output, since vg pileup is gone
+vg pack -x flat.xg -o 2snp.gam.snapshot.cx -g pileup/2snp.gam -e
+is $(cat pileup/2snp.gam.vgpu.json | jq '.node_pileups[].base_pileup[] | (.num_bases // 0)' | awk '{ print NR-1, $0 }' | head | md5sum | cut -f 1 -d\ )\
+   $(vg pack -x flat.xg -di 2snp.gam.snapshot.cx -e | awk '{ print $3, $4 }' | tail -n+2 | head | md5sum | cut -f 1 -d\ ) "pileup packs agree with graph coverage"
+rm -f 2snp.gam.snapshot.cx
 
 vg pack -x flat.xg -i 2snp.gam.cx -i 2snp.gam.cx -i 2snp.gam.cx -o 2snp.gam.cx.3x
 
@@ -59,10 +61,9 @@ rm -f flat.vg 2snp.vg 2snp.xg 2snp.sim flat.gcsa flat.gcsa.lcp flat.xg 2snp.xg 2
 vg construct -r tiny/tiny.fa -v tiny/tiny.vcf.gz > tiny.vg
 vg index tiny.vg -x tiny.xg
 
-vg sim -l 10 -x tiny.xg -n 50 -a -s 23 > tiny.gam
-vg augment -a pileup tiny.vg tiny.gam -P tiny.vgpu > /dev/null
-x=$(vg view -l tiny.vgpu | jq '.edge_pileups' | grep num_reads | awk '{print $2}' | sed -e 's/\,//' | awk '{sum+=$1} END {print sum}')
-y=$(vg pack -x tiny.xg -g tiny.gam -D -o tiny.pack | grep -v from | awk '{sum+=$5} END {print sum}')
+# now that pileup doesn't exist, we use the snapshots in pileup/ instead of recomputing
+x=$(cat pileup/tiny.vgpu.json | jq '.edge_pileups' | grep num_reads | awk '{print $2}' | sed -e 's/\,//' | awk '{sum+=$1} END {print sum}')
+y=$(vg pack -x tiny.xg -g pileup/tiny.gam -D -o tiny.pack | grep -v from | awk '{sum+=$5} END {print sum}')
 is $x $y "pack computes the same total edge coverage as pileup"
 x=$(vg pack -x tiny.xg -i tiny.pack -D | grep -v from | awk '{sum+=$5} END {print sum}')
 is $x $y "pack stores the correct edge pileup to disk"