Skip to content

Commit

Permalink
made the gbwt-update 2-step (instead of 3-step) process work now, and…
Browse files Browse the repository at this point in the history
… fixed a bug where embedded paths through the snarl were not being integrated into the alignment properly when empty nodes were present.
  • Loading branch information
Robin-Rounthwaite committed Apr 4, 2024
1 parent 461990a commit 6d064a4
Show file tree
Hide file tree
Showing 3 changed files with 144 additions and 45 deletions.
44 changes: 22 additions & 22 deletions src/algorithms/0_get_parallel_normalize_regions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,31 +144,31 @@ vector<pair<id_t, id_t>> NormalizeRegionFinder::split_sources_and_sinks(vector<p
return false;
});

//todo: begin debug_code
std::cerr << "looking at original leftmost_handle: " << _graph.get_id(leftmost_handle) << endl;
vector<step_handle_t> steps_0 = _graph.steps_of_handle(_graph.get_handle(_graph.get_id(leftmost_handle)));
for (step_handle_t step : steps_0)
{
std::cerr << _graph.get_path_name(_graph.get_path_handle_of_step(step)) << " on " << _graph.get_id(_graph.get_handle_of_step(step)) << endl;
}
//todo: end debug_code
// //todo: begin debug_code
// std::cerr << "looking at original leftmost_handle: " << _graph.get_id(leftmost_handle) << endl;
// vector<step_handle_t> steps_0 = _graph.steps_of_handle(_graph.get_handle(_graph.get_id(leftmost_handle)));
// for (step_handle_t step : steps_0)
// {
// std::cerr << _graph.get_path_name(_graph.get_path_handle_of_step(step)) << " on " << _graph.get_id(_graph.get_handle_of_step(step)) << endl;
// }
// //todo: end debug_code

pair<handle_t, handle_t> new_leftmosts = _graph.divide_handle(leftmost_handle, _graph.get_sequence(leftmost_handle).size()/2);
//todo: begin debug_code
std::cerr << "looking at handle new_leftmosts.first: " << _graph.get_id(new_leftmosts.first) << endl;
vector<step_handle_t> steps_1 = _graph.steps_of_handle(_graph.get_handle(_graph.get_id(new_leftmosts.first)));
for (step_handle_t step : steps_1)
{
std::cerr << _graph.get_path_name(_graph.get_path_handle_of_step(step)) << " on " << _graph.get_id(_graph.get_handle_of_step(step)) << endl;
}
std::cerr << "looking at handle new_leftmosts.second: " << _graph.get_id(new_leftmosts.second) << endl;
vector<step_handle_t> steps_2 = _graph.steps_of_handle(_graph.get_handle(_graph.get_id(new_leftmosts.second)));
for (step_handle_t step : steps_2)
{
std::cerr << _graph.get_path_name(_graph.get_path_handle_of_step(step)) << " on " << _graph.get_id(_graph.get_handle_of_step(step)) << endl;
}
// //todo: begin debug_code
// std::cerr << "looking at handle new_leftmosts.first: " << _graph.get_id(new_leftmosts.first) << endl;
// vector<step_handle_t> steps_1 = _graph.steps_of_handle(_graph.get_handle(_graph.get_id(new_leftmosts.first)));
// for (step_handle_t step : steps_1)
// {
// std::cerr << _graph.get_path_name(_graph.get_path_handle_of_step(step)) << " on " << _graph.get_id(_graph.get_handle_of_step(step)) << endl;
// }
// std::cerr << "looking at handle new_leftmosts.second: " << _graph.get_id(new_leftmosts.second) << endl;
// vector<step_handle_t> steps_2 = _graph.steps_of_handle(_graph.get_handle(_graph.get_id(new_leftmosts.second)));
// for (step_handle_t step : steps_2)
// {
// std::cerr << _graph.get_path_name(_graph.get_path_handle_of_step(step)) << " on " << _graph.get_id(_graph.get_handle_of_step(step)) << endl;
// }

//todo: end debug_code
// //todo: end debug_code


desegregation_candidates.push_back(make_pair(make_pair(_graph.get_id(new_leftmosts.first), _graph.get_id(new_leftmosts.second)), original_leftmost));
Expand Down
51 changes: 30 additions & 21 deletions src/algorithms/0_oo_normalize_snarls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -732,6 +732,7 @@ void SnarlNormalizer::extract_haplotypes(const SubHandleGraph& snarl, const pair
vector<pair<gbwt::vector_type, string>>& source_to_sink_gbwt_paths,
const bool stop_inclusive/*=false*/)
{
// cerr << "inside extract_haplotypes." << endl;
bool backwards = false;

// if (_debug_print == true)
Expand Down Expand Up @@ -840,7 +841,7 @@ void SnarlNormalizer::extract_haplotypes(const SubHandleGraph& snarl, const pair
// }
// //todo: end debug_print
// cerr << "hap original text: " << *get<0>(haplotypes).begin() << endl;
// check to see if the leftmost or rightmost node is empty. If so, treat the blank
// check to see if the leftmost node is empty. If so, treat the blank
// node as containing a character "A". (this is important for dealing with how
// get_parallel_normalize_regions sometimes adds a blank node when isolating adjacent
// snarls for parallelization).
Expand Down Expand Up @@ -870,23 +871,7 @@ void SnarlNormalizer::extract_haplotypes(const SubHandleGraph& snarl, const pair
// cerr << hap << endl;
// }
}
// else if (_nodes_to_delete.find(region.first) != _nodes_to_delete.end())
// {
// cerr << "earlier error of the two. " << endl;
// cerr << "here is the node to delete: " << region.second << endl;
// cerr << "here is the _nodes_to_delete.size(): " << _nodes_to_delete.size() << endl;
// cerr << "here is the _nodes_to_delete: " <<endl;
// for (auto node : _nodes_to_delete)
// {
// cerr << node << " ";
// }
// cerr << endl;
// cerr << "ERROR: found a node_to_delete that isn't of length zero. This shouldn't happen." << endl;
// exit(1);
// }
// cerr << region.first << " " << region.second << " in extract_haplotypes 4" << endl;

// check to see if the leftmost or rightmost node is empty. If so, treat the blank
// check to see if rightmost node is empty. If so, treat the blank
// node as containing a character "A". (this is important for dealing with how
// get_parallel_normalize_regions sometimes adds a blank node when isolating adjacent
// snarls for parallelization).
Expand Down Expand Up @@ -994,13 +979,37 @@ void SnarlNormalizer::extract_haplotypes(const SubHandleGraph& snarl, const pair
// get the sequence of the source to sink path, and add it to the
// paths to be aligned.
string path_seq;
step_handle_t prev_step = _graph.get_previous_step(path.first);
// cerr << "here is 'path.first': " << _graph.get_id(_graph.get_handle_of_step(path.first)) << endl;
// cerr << "here is path name: " << _graph.get_path_name(_graph.get_path_handle_of_step(path.first)) << endl;
step_handle_t cur_step = path.first;
while (_graph.get_id(_graph.get_handle_of_step(prev_step)) != region.second) { //that is, stop after we have ran the while loop on the subgraph sink
step_handle_t prev_step = _graph.get_previous_step(cur_step);



// while (!_graph.has_previous_step(cur_step) || _graph.get_id(_graph.get_handle_of_step(prev_step)) != region.second) { //that is, stop after we have ran the while loop on the subgraph sink
while (cur_step==_graph.path_begin(_graph.get_path_handle_of_step(cur_step)) || _graph.get_id(_graph.get_handle_of_step(prev_step)) != region.second) { //that is, stop after we have ran the while loop on the subgraph sink
// cerr << "!_graph.has_previous_step(cur_step) " << !_graph.has_previous_step(cur_step) << endl;
// if (_graph.has_previous_step(cur_step))
// {
// cerr << "!_graph.has_previous_step(cur_step) " << !_graph.has_previous_step(cur_step) << " _graph.get_id(_graph.get_handle_of_step(prev_step)) != region.second: " << (_graph.get_id(_graph.get_handle_of_step(prev_step)) != region.second) << " _graph.get_id(_graph.get_handle_of_step(prev_step)) " << _graph.get_id(_graph.get_handle_of_step(prev_step)) << " region.second " << region.second << endl;
// }
// cerr << "here is cur_step: " << _graph.get_id(_graph.get_handle_of_step(cur_step)) << " " << _graph.get_sequence(_graph.get_handle_of_step(cur_step)) << endl;
path_seq += _graph.get_sequence(_graph.get_handle_of_step(cur_step));
prev_step = cur_step;
cur_step = _graph.get_next_step(cur_step);
}
if (_graph.get_sequence(_graph.get_handle(region.second)).size() == 0)
{
//If the rightmost node is empty, append an A to the end to ensure that the empty node is represented. This will be removed before realigned subgraph is inserted into the graph.
//todo: before writing this line, see if there's a better place to add it. Don't I add it to all haplotypes at the same time somewhere else (presumably accidentally before I get embedded paths?).
path_seq += "A";
}
if (_graph.get_sequence(_graph.get_handle(region.first)).size() == 0)
{
//If the rightmost node is empty, append an A to the end to ensure that the empty node is represented. This will be removed before realigned subgraph is inserted into the graph.
//todo: before writing this line, see if there's a better place to add it. Don't I add it to all haplotypes at the same time somewhere else (presumably accidentally before I get embedded paths?).
path_seq = "A" + path_seq;
}
if (backwards) //guaranteed false, here. (but not where code was copied from)
{
get<0>(haplotypes).emplace(reverse_complement(path_seq));
Expand All @@ -1012,7 +1021,7 @@ void SnarlNormalizer::extract_haplotypes(const SubHandleGraph& snarl, const pair
}
else
{
//this path goes halfway across the graph, and thus must be dropped.
//this path goes part-way across the graph, and thus must be dropped.
//todo: actually implement this. Currently just _debug_code.
// paths_to_delete.emplace(path);
// string deleted_path = _graph.get_path_name(_graph.get_path_handle_of_step(path.first));
Expand Down
94 changes: 92 additions & 2 deletions src/subcommand/0_normalize_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,7 @@ int main_normalize(int argc, char **argv) {
//todo: find way to load gbwtgraph's unique pointer without saving and then reloading file.
gbwt_graph = vg::io::VPKG::load_one<gbwtgraph::GBWTGraph>(gbwt_graph_output_file);
gbwt_graph->set_gbwt(*gbwt);
gbwt_graph_file = gbwt_graph_output_file;
}
else
{
Expand Down Expand Up @@ -539,6 +540,26 @@ int main_normalize(int argc, char **argv) {
// //todo: end debug-code:
parallel_regions_gbwt_updates = region_finder.get_parallel_normalize_regions(snarl_roots, *distance_index, parallel_normalize_regions, desegregation_candidates, segregated_node_to_parent);


// if (run_tests && graph->max_node_id() < 100) //essentially just for tiny tests
// {
// //visualize every node after get_parallel_normalize_regions.
// cerr << "here are the nodes after get_parallel_normalize_regions" << endl;
// graph->for_each_handle([&](handle_t handle){
// cerr << "node: " << graph->get_id(handle) << " " << graph->get_sequence(handle) << "\tneighbors left: " ;
// graph->follow_edges(handle, true, [&](handle_t left){
// cerr << graph->get_id(left) << " ";

// });
// cerr << "\tright: ";
// graph->follow_edges(handle, false, [&](handle_t right){
// cerr << graph->get_id(right) << " ";
// });
// cerr << endl;
// });
// }


cerr << "found " << parallel_normalize_regions.size() << " regions to normalize." << endl;
if (run_tests)
{
Expand Down Expand Up @@ -718,6 +739,7 @@ int main_normalize(int argc, char **argv) {
if (!parallel_regions_gbwt_updates.size()==0)
{
gbwt.reset();
gbwt_graph.reset();
}

// =======running normalize=======
Expand Down Expand Up @@ -788,6 +810,14 @@ int main_normalize(int argc, char **argv) {
cerr << "max_region_gap (-n): " << max_region_gap << endl;
cerr << "threads (-t): " << threads << endl;

// if (run_tests && graph->max_node_id() < 100) //essentially just for tiny tests
// {
// cerr << "see every node in graph after normalization but before segregation: " << endl;
// graph->for_each_handle([&](handle_t handle){
// cerr << "id: " << graph->get_id(handle) << " seq: " << graph->get_sequence(handle) << " length: " << graph->get_length(handle) << endl;
// });
// }

// cerr << "=======updating gbwt after normalization and before desegregating regions=======" << endl;
// cerr << "updating gbwt after normalization" << endl;
// auto _post_norm_gbwt_update_start = chrono::high_resolution_clock::now();
Expand All @@ -801,22 +831,77 @@ int main_normalize(int argc, char **argv) {
// // make a new gbwt_graph for the normalized_gbwt.
// gbwtgraph::GBWTGraph normalized_gbwt_graph = gbwtgraph::GBWTGraph(normalized_gbwt, *graph);

// if (run_tests && graph->max_node_id() < 100) //essentially just for tiny tests
// {
// //visualize every update to the gbwt, and see if it properly accounts for what's going to happen with desegregation candidates.
// for (vg::RebuildJob::mapping_type update : gbwt_normalize_updates)
// {
// cerr << "original sequence of nodes: ";
// for (auto node : update.first)
// {
// cerr << gbwt::Node::id(node) << " ";
// }
// cerr << endl;
// cerr << "new sequence of nodes: ";
// for (auto node : update.second)
// {
// cerr << gbwt::Node::id(node) << " ";
// }
// cerr << endl;
// }
// }


cerr << "=======desegregating normalization regions after parallelized normalization=======" << endl;
cerr << "desegregating the normalize regions." << endl;
vg::algorithms::NormalizeRegionFinder post_norm_region_finder = vg::algorithms::NormalizeRegionFinder(*graph, max_region_size, max_region_gap);
//merges nodes, updates entries in gbwt_normalize to match those merged nodes. (note: is there a way to make this O(n), rather than O(n^2)? Maybe a reverse index... seems possibly a distraction. Could be worth just running desegregate nodes after updating the gbwt, and update the gbwt a third time.)
// std::vector<vg::RebuildJob::mapping_type> desegregated_regions_gbwt_updates = post_norm_region_finder.desegregate_nodes(desegregation_candidates);
post_norm_region_finder.desegregate_nodes(desegregation_candidates);

if (run_tests && graph->max_node_id() < 100) //essentially just for tiny tests
{
//check to see if every node in the graph can be accessed, and is not empty.
// cerr << "after desegregation: " << endl;
graph->for_each_handle([&](handle_t handle){
if (graph->get_length(handle) == 0)
{
cerr << "ERROR: a node is of length zero after normalization." << endl;
cerr << "the graph will be saved for debugging purposes, but is essentially invalid." << endl;
}
// cerr << " graph->get_id(handle): " << graph->get_id(handle) << " graph->get_sequence(handle): " << graph->get_sequence(handle) << " graph->get_length(handle): " << graph->get_length(handle) << endl;
// cerr << "left neighbors: " << endl << "\t";
// graph->follow_edges(handle, true, [&](handle_t left){
// cerr << graph->get_id(left) << " ";

// });
// cerr << endl;
});
}

cerr << "======preparing and saving output=======" << endl;

cerr << "saving updated graph to file" << endl;
//save normalized graph
vg::io::save_handle_graph(graph.get(), std::cout);

//todo: make this code fit so I can update the original gbwt.
//todo: also reset any existing gbwts that aren't the original gbwt, since we don't need them anymore.
//re-load the original gbwt and gbwt graph for the apply_gbwt_changelog function
// gbwt
cerr << "loading original gbwt" << endl;
ifstream gbwt_stream_2;
gbwt_stream_2.open(gbwt_file);
gbwt = vg::io::VPKG::load_one<gbwt::GBWT>(gbwt_stream_2);

// gbwt graph
cerr << "loading original gbwt graph" << endl;
gbwt_graph = vg::io::VPKG::load_one<gbwtgraph::GBWTGraph>(gbwt_graph_file);
gbwt_graph->set_gbwt(*gbwt);

cerr << "updating gbwt after de-isolation." << endl;
auto _desegregated_regions_gbwt_update_start = chrono::high_resolution_clock::now();
gbwt::GBWT normalized_desegregated_gbwt = vg::algorithms::apply_gbwt_changelog(parallel_regions_gbwt_graph, gbwt_normalize_updates, parallel_regions_gbwt, gbwt_threads, false);
auto _desegregated_regions_gbwt_update_start = chrono::high_resolution_clock::now();
gbwt::GBWT normalized_desegregated_gbwt = vg::algorithms::apply_gbwt_changelog(*gbwt_graph, gbwt_normalize_updates, *gbwt, gbwt_threads, false);

auto _desegregated_regions_gbwt_update_end = chrono::high_resolution_clock::now();
chrono::duration<double> _desegregated_regions_gbwt_update_elapsed = _desegregated_regions_gbwt_update_end - _desegregated_regions_gbwt_update_start;
Expand All @@ -825,6 +910,11 @@ int main_normalize(int argc, char **argv) {
cerr << "saving updated gbwt" << endl;
save_gbwt(normalized_desegregated_gbwt, output_gbwt_file, true);

cerr << "generating and saving the corresponding gbwt graph" << endl;
// make a new gbwt_graph for the normalized_gbwt.
gbwtgraph::GBWTGraph normalized_desegregated_gbwt_graph = gbwtgraph::GBWTGraph(normalized_desegregated_gbwt, *graph);
save_gbwtgraph(normalized_desegregated_gbwt_graph, output_gbwt_file + ".gg", true);

// Record end time
auto finish = std::chrono::high_resolution_clock::now();
chrono::duration<double> elapsed = finish - start;
Expand Down

0 comments on commit 6d064a4

Please sign in to comment.