diff --git a/src/algorithms/0_get_parallel_normalize_regions.cpp b/src/algorithms/0_get_parallel_normalize_regions.cpp index db9c84a0981..076f0708f80 100644 --- a/src/algorithms/0_get_parallel_normalize_regions.cpp +++ b/src/algorithms/0_get_parallel_normalize_regions.cpp @@ -144,31 +144,31 @@ vector> NormalizeRegionFinder::split_sources_and_sinks(vector

steps_0 = _graph.steps_of_handle(_graph.get_handle(_graph.get_id(leftmost_handle))); - for (step_handle_t step : steps_0) - { - std::cerr << _graph.get_path_name(_graph.get_path_handle_of_step(step)) << " on " << _graph.get_id(_graph.get_handle_of_step(step)) << endl; - } - //todo: end debug_code + // //todo: begin debug_code + // std::cerr << "looking at original leftmost_handle: " << _graph.get_id(leftmost_handle) << endl; + // vector steps_0 = _graph.steps_of_handle(_graph.get_handle(_graph.get_id(leftmost_handle))); + // for (step_handle_t step : steps_0) + // { + // std::cerr << _graph.get_path_name(_graph.get_path_handle_of_step(step)) << " on " << _graph.get_id(_graph.get_handle_of_step(step)) << endl; + // } + // //todo: end debug_code pair new_leftmosts = _graph.divide_handle(leftmost_handle, _graph.get_sequence(leftmost_handle).size()/2); - //todo: begin debug_code - std::cerr << "looking at handle new_leftmosts.first: " << _graph.get_id(new_leftmosts.first) << endl; - vector steps_1 = _graph.steps_of_handle(_graph.get_handle(_graph.get_id(new_leftmosts.first))); - for (step_handle_t step : steps_1) - { - std::cerr << _graph.get_path_name(_graph.get_path_handle_of_step(step)) << " on " << _graph.get_id(_graph.get_handle_of_step(step)) << endl; - } - std::cerr << "looking at handle new_leftmosts.second: " << _graph.get_id(new_leftmosts.second) << endl; - vector steps_2 = _graph.steps_of_handle(_graph.get_handle(_graph.get_id(new_leftmosts.second))); - for (step_handle_t step : steps_2) - { - std::cerr << _graph.get_path_name(_graph.get_path_handle_of_step(step)) << " on " << _graph.get_id(_graph.get_handle_of_step(step)) << endl; - } + // //todo: begin debug_code + // std::cerr << "looking at handle new_leftmosts.first: " << _graph.get_id(new_leftmosts.first) << endl; + // vector steps_1 = _graph.steps_of_handle(_graph.get_handle(_graph.get_id(new_leftmosts.first))); + // for (step_handle_t step : steps_1) + // { + // std::cerr << _graph.get_path_name(_graph.get_path_handle_of_step(step)) << " on " << _graph.get_id(_graph.get_handle_of_step(step)) << endl; + // } + // std::cerr << "looking at handle new_leftmosts.second: " << _graph.get_id(new_leftmosts.second) << endl; + // vector steps_2 = _graph.steps_of_handle(_graph.get_handle(_graph.get_id(new_leftmosts.second))); + // for (step_handle_t step : steps_2) + // { + // std::cerr << _graph.get_path_name(_graph.get_path_handle_of_step(step)) << " on " << _graph.get_id(_graph.get_handle_of_step(step)) << endl; + // } - //todo: end debug_code + // //todo: end debug_code desegregation_candidates.push_back(make_pair(make_pair(_graph.get_id(new_leftmosts.first), _graph.get_id(new_leftmosts.second)), original_leftmost)); diff --git a/src/algorithms/0_oo_normalize_snarls.cpp b/src/algorithms/0_oo_normalize_snarls.cpp index d8cdf2fcfed..03c1b9f5fbe 100644 --- a/src/algorithms/0_oo_normalize_snarls.cpp +++ b/src/algorithms/0_oo_normalize_snarls.cpp @@ -732,6 +732,7 @@ void SnarlNormalizer::extract_haplotypes(const SubHandleGraph& snarl, const pair vector>& source_to_sink_gbwt_paths, const bool stop_inclusive/*=false*/) { + // cerr << "inside extract_haplotypes." << endl; bool backwards = false; // if (_debug_print == true) @@ -840,7 +841,7 @@ void SnarlNormalizer::extract_haplotypes(const SubHandleGraph& snarl, const pair // } // //todo: end debug_print // cerr << "hap original text: " << *get<0>(haplotypes).begin() << endl; - // check to see if the leftmost or rightmost node is empty. If so, treat the blank + // check to see if the leftmost node is empty. If so, treat the blank // node as containing a character "A". (this is important for dealing with how // get_parallel_normalize_regions sometimes adds a blank node when isolating adjacent // snarls for parallelization). @@ -870,23 +871,7 @@ void SnarlNormalizer::extract_haplotypes(const SubHandleGraph& snarl, const pair // cerr << hap << endl; // } } - // else if (_nodes_to_delete.find(region.first) != _nodes_to_delete.end()) - // { - // cerr << "earlier error of the two. " << endl; - // cerr << "here is the node to delete: " << region.second << endl; - // cerr << "here is the _nodes_to_delete.size(): " << _nodes_to_delete.size() << endl; - // cerr << "here is the _nodes_to_delete: " <(haplotypes).emplace(reverse_complement(path_seq)); @@ -1012,7 +1021,7 @@ void SnarlNormalizer::extract_haplotypes(const SubHandleGraph& snarl, const pair } else { - //this path goes halfway across the graph, and thus must be dropped. + //this path goes part-way across the graph, and thus must be dropped. //todo: actually implement this. Currently just _debug_code. // paths_to_delete.emplace(path); // string deleted_path = _graph.get_path_name(_graph.get_path_handle_of_step(path.first)); diff --git a/src/subcommand/0_normalize_main.cpp b/src/subcommand/0_normalize_main.cpp index 315a272e2c6..61c8d90fc8c 100644 --- a/src/subcommand/0_normalize_main.cpp +++ b/src/subcommand/0_normalize_main.cpp @@ -415,6 +415,7 @@ int main_normalize(int argc, char **argv) { //todo: find way to load gbwtgraph's unique pointer without saving and then reloading file. gbwt_graph = vg::io::VPKG::load_one(gbwt_graph_output_file); gbwt_graph->set_gbwt(*gbwt); + gbwt_graph_file = gbwt_graph_output_file; } else { @@ -539,6 +540,26 @@ int main_normalize(int argc, char **argv) { // //todo: end debug-code: parallel_regions_gbwt_updates = region_finder.get_parallel_normalize_regions(snarl_roots, *distance_index, parallel_normalize_regions, desegregation_candidates, segregated_node_to_parent); + + // if (run_tests && graph->max_node_id() < 100) //essentially just for tiny tests + // { + // //visualize every node after get_parallel_normalize_regions. + // cerr << "here are the nodes after get_parallel_normalize_regions" << endl; + // graph->for_each_handle([&](handle_t handle){ + // cerr << "node: " << graph->get_id(handle) << " " << graph->get_sequence(handle) << "\tneighbors left: " ; + // graph->follow_edges(handle, true, [&](handle_t left){ + // cerr << graph->get_id(left) << " "; + + // }); + // cerr << "\tright: "; + // graph->follow_edges(handle, false, [&](handle_t right){ + // cerr << graph->get_id(right) << " "; + // }); + // cerr << endl; + // }); + // } + + cerr << "found " << parallel_normalize_regions.size() << " regions to normalize." << endl; if (run_tests) { @@ -718,6 +739,7 @@ int main_normalize(int argc, char **argv) { if (!parallel_regions_gbwt_updates.size()==0) { gbwt.reset(); + gbwt_graph.reset(); } // =======running normalize======= @@ -788,6 +810,14 @@ int main_normalize(int argc, char **argv) { cerr << "max_region_gap (-n): " << max_region_gap << endl; cerr << "threads (-t): " << threads << endl; + // if (run_tests && graph->max_node_id() < 100) //essentially just for tiny tests + // { + // cerr << "see every node in graph after normalization but before segregation: " << endl; + // graph->for_each_handle([&](handle_t handle){ + // cerr << "id: " << graph->get_id(handle) << " seq: " << graph->get_sequence(handle) << " length: " << graph->get_length(handle) << endl; + // }); + // } + // cerr << "=======updating gbwt after normalization and before desegregating regions=======" << endl; // cerr << "updating gbwt after normalization" << endl; // auto _post_norm_gbwt_update_start = chrono::high_resolution_clock::now(); @@ -801,6 +831,27 @@ int main_normalize(int argc, char **argv) { // // make a new gbwt_graph for the normalized_gbwt. // gbwtgraph::GBWTGraph normalized_gbwt_graph = gbwtgraph::GBWTGraph(normalized_gbwt, *graph); + // if (run_tests && graph->max_node_id() < 100) //essentially just for tiny tests + // { + // //visualize every update to the gbwt, and see if it properly accounts for what's going to happen with desegregation candidates. + // for (vg::RebuildJob::mapping_type update : gbwt_normalize_updates) + // { + // cerr << "original sequence of nodes: "; + // for (auto node : update.first) + // { + // cerr << gbwt::Node::id(node) << " "; + // } + // cerr << endl; + // cerr << "new sequence of nodes: "; + // for (auto node : update.second) + // { + // cerr << gbwt::Node::id(node) << " "; + // } + // cerr << endl; + // } + // } + + cerr << "=======desegregating normalization regions after parallelized normalization=======" << endl; cerr << "desegregating the normalize regions." << endl; vg::algorithms::NormalizeRegionFinder post_norm_region_finder = vg::algorithms::NormalizeRegionFinder(*graph, max_region_size, max_region_gap); @@ -808,15 +859,49 @@ int main_normalize(int argc, char **argv) { // std::vector desegregated_regions_gbwt_updates = post_norm_region_finder.desegregate_nodes(desegregation_candidates); post_norm_region_finder.desegregate_nodes(desegregation_candidates); + if (run_tests && graph->max_node_id() < 100) //essentially just for tiny tests + { + //check to see if every node in the graph can be accessed, and is not empty. + // cerr << "after desegregation: " << endl; + graph->for_each_handle([&](handle_t handle){ + if (graph->get_length(handle) == 0) + { + cerr << "ERROR: a node is of length zero after normalization." << endl; + cerr << "the graph will be saved for debugging purposes, but is essentially invalid." << endl; + } + // cerr << " graph->get_id(handle): " << graph->get_id(handle) << " graph->get_sequence(handle): " << graph->get_sequence(handle) << " graph->get_length(handle): " << graph->get_length(handle) << endl; + // cerr << "left neighbors: " << endl << "\t"; + // graph->follow_edges(handle, true, [&](handle_t left){ + // cerr << graph->get_id(left) << " "; + + // }); + // cerr << endl; + }); + } + cerr << "======preparing and saving output=======" << endl; cerr << "saving updated graph to file" << endl; //save normalized graph vg::io::save_handle_graph(graph.get(), std::cout); + //todo: make this code fit so I can update the original gbwt. + //todo: also reset any existing gbwts that aren't the original gbwt, since we don't need them anymore. + //re-load the original gbwt and gbwt graph for the apply_gbwt_changelog function + // gbwt + cerr << "loading original gbwt" << endl; + ifstream gbwt_stream_2; + gbwt_stream_2.open(gbwt_file); + gbwt = vg::io::VPKG::load_one(gbwt_stream_2); + + // gbwt graph + cerr << "loading original gbwt graph" << endl; + gbwt_graph = vg::io::VPKG::load_one(gbwt_graph_file); + gbwt_graph->set_gbwt(*gbwt); + cerr << "updating gbwt after de-isolation." << endl; - auto _desegregated_regions_gbwt_update_start = chrono::high_resolution_clock::now(); - gbwt::GBWT normalized_desegregated_gbwt = vg::algorithms::apply_gbwt_changelog(parallel_regions_gbwt_graph, gbwt_normalize_updates, parallel_regions_gbwt, gbwt_threads, false); + auto _desegregated_regions_gbwt_update_start = chrono::high_resolution_clock::now(); + gbwt::GBWT normalized_desegregated_gbwt = vg::algorithms::apply_gbwt_changelog(*gbwt_graph, gbwt_normalize_updates, *gbwt, gbwt_threads, false); auto _desegregated_regions_gbwt_update_end = chrono::high_resolution_clock::now(); chrono::duration _desegregated_regions_gbwt_update_elapsed = _desegregated_regions_gbwt_update_end - _desegregated_regions_gbwt_update_start; @@ -825,6 +910,11 @@ int main_normalize(int argc, char **argv) { cerr << "saving updated gbwt" << endl; save_gbwt(normalized_desegregated_gbwt, output_gbwt_file, true); + cerr << "generating and saving the corresponding gbwt graph" << endl; + // make a new gbwt_graph for the normalized_gbwt. + gbwtgraph::GBWTGraph normalized_desegregated_gbwt_graph = gbwtgraph::GBWTGraph(normalized_desegregated_gbwt, *graph); + save_gbwtgraph(normalized_desegregated_gbwt_graph, output_gbwt_file + ".gg", true); + // Record end time auto finish = std::chrono::high_resolution_clock::now(); chrono::duration elapsed = finish - start;