From 8b2edeee7f643d055b9ccb73704d27753d8bffda Mon Sep 17 00:00:00 2001 From: Ryan Wick Date: Wed, 8 Jan 2025 16:04:17 +1100 Subject: [PATCH] Change merge logic --- src/clean.rs | 4 ++-- src/cluster.rs | 2 +- src/graph_simplification.rs | 34 +++++++++++++++++++++++++--------- src/resolve.rs | 3 ++- src/trim.rs | 2 +- 5 files changed, 31 insertions(+), 14 deletions(-) diff --git a/src/clean.rs b/src/clean.rs index fcd5ebf..54dee8f 100644 --- a/src/clean.rs +++ b/src/clean.rs @@ -67,7 +67,7 @@ fn finished_message(out_gfa: &Path) { fn clean_graph(graph: &mut UnitigGraph, remove_tigs: &[u32], in_gfa: &Path) { section_header("Cleaning graph"); explanation("The user-specified tigs are now removed from the graph."); - check_tig_nums_are_valid(in_gfa, &graph, remove_tigs); + check_tig_nums_are_valid(in_gfa, graph, remove_tigs); let remove_set: HashSet = HashSet::from_iter(remove_tigs.iter().cloned()); graph.remove_unitigs_by_number(remove_set); graph.print_basic_graph_info(); @@ -77,7 +77,7 @@ fn clean_graph(graph: &mut UnitigGraph, remove_tigs: &[u32], in_gfa: &Path) { fn merge_graph(graph: &mut UnitigGraph) { section_header("Merging linear paths"); explanation("Linear paths in the graph are now merged."); - merge_linear_paths(graph, &vec![], false); + merge_linear_paths(graph, &vec![]); graph.print_basic_graph_info(); graph.renumber_unitigs(); } diff --git a/src/cluster.rs b/src/cluster.rs index fa73695..d7f2d6d 100644 --- a/src/cluster.rs +++ b/src/cluster.rs @@ -793,7 +793,7 @@ fn save_cluster_gfa(sequences: &[Sequence], cluster_num: u16, gfa_lines: &Vec Vec { } -pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec, require_same_depth: bool) { +pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec) { // This function looks for linear paths in the graph (where one Unitig leads only to another // and vice versa) and merges them together when possible. // @@ -343,12 +344,10 @@ pub fn merge_linear_paths(graph: &mut UnitigGraph, seqs: &Vec, require // Extend the path as far as possible. loop { let unitig = current_path.last().unwrap(); - let current_depth = unitig.depth(); if cannot_merge_end(unitig.number(), unitig.strand, &fixed_starts, &fixed_ends) { break; } let mut outputs = if unitig.strand { get_exclusive_outputs(&unitig.unitig) } else { get_exclusive_inputs(&unitig.unitig) }; if outputs.len() != 1 { break; } let output = &mut outputs[0]; - if require_same_depth && output.depth() != current_depth { break; } if !unitig.strand { output.strand = !output.strand; } let output_number = output.number(); if already_used.contains(&output_number) { break; } @@ -433,8 +432,7 @@ fn merge_path(graph: &mut UnitigGraph, path: &Vec, new_unitig_numb number: new_unitig_number, reverse_seq: reverse_complement(&merged_seq), forward_seq: merged_seq, - depth: if !forward_positions.is_empty() { forward_positions.len() as f64 } - else { weighted_mean_depth(path) }, + depth: get_merge_path_depth(path, &forward_positions), forward_positions, reverse_positions, forward_next, forward_prev, reverse_next, reverse_prev, ..Default::default() @@ -502,6 +500,24 @@ fn merge_unitig_seqs(path: &Vec) -> Vec { } +fn get_merge_path_depth(path: &Vec, forward_positions: &[Position]) -> f64 { + // If the unitigs have position information, use that to determine depth. + if !forward_positions.is_empty() { + return forward_positions.len() as f64; + } + + // If the path contains an anchor unitig, set the merged depth to the anchor's depth. + for u in path { + if u.anchor() { + return u.depth(); + } + } + + // Otherwise, give a weighted mean depth of the unitigs in the path. + weighted_mean_depth(path) +} + + fn weighted_mean_depth(path: &Vec) -> f64 { let total_length = path.iter().map(|u| u.length()).sum::() as f64; let mut depth_sum = 0.0; @@ -729,7 +745,7 @@ mod tests { fn test_merge_linear_paths_1() { let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_3()); assert_eq!(graph.unitigs.len(), 7); - merge_linear_paths(&mut graph, &seqs, false); + merge_linear_paths(&mut graph, &seqs); assert_eq!(graph.unitigs.len(), 3); assert_eq!(std::str::from_utf8(&graph.unitig_index.get(&8).unwrap().borrow().forward_seq).unwrap(), "TTCGCTGCGCTCGCTTCGCTTTTGCACAGCGACGACGGCATGCCTGAATCGCCTA"); @@ -753,7 +769,7 @@ mod tests { fn test_merge_linear_paths_2() { let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_4()); assert_eq!(graph.unitigs.len(), 5); - merge_linear_paths(&mut graph, &seqs, false); + merge_linear_paths(&mut graph, &seqs); assert_eq!(graph.unitigs.len(), 2); assert_eq!(std::str::from_utf8(&graph.unitig_index.get(&6).unwrap().borrow().forward_seq).unwrap(), "ACGACTACGAGCACGAGTCGTCGTCGTAACTGACT"); @@ -772,7 +788,7 @@ mod tests { fn test_merge_linear_paths_3() { let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_5()); assert_eq!(graph.unitigs.len(), 6); - merge_linear_paths(&mut graph, &seqs, false); + merge_linear_paths(&mut graph, &seqs); assert_eq!(graph.unitigs.len(), 5); assert_eq!(std::str::from_utf8(&graph.unitig_index.get(&7).unwrap().borrow().forward_seq).unwrap(), "AAATGCGACTGTG"); @@ -782,7 +798,7 @@ mod tests { fn test_merge_linear_paths_4() { let (mut graph, seqs) = UnitigGraph::from_gfa_lines(&get_test_gfa_14()); assert_eq!(graph.unitigs.len(), 13); - merge_linear_paths(&mut graph, &seqs, false); + merge_linear_paths(&mut graph, &seqs); assert_eq!(graph.unitigs.len(), 11); } } diff --git a/src/resolve.rs b/src/resolve.rs index fdcd54b..f707071 100644 --- a/src/resolve.rs +++ b/src/resolve.rs @@ -232,13 +232,14 @@ fn apply_bridges(graph: &mut UnitigGraph, bridges: &Vec, bridge_depth: f } } delete_unitigs_not_connected_to_anchor(graph); + graph.remove_zero_depth_unitigs(); // TODO: add logic for removing non-anchor tips to handle blunt ends? } fn merge_after_bridging(graph: &mut UnitigGraph) { - merge_linear_paths(graph, &vec![], true); + merge_linear_paths(graph, &vec![]); graph.print_basic_graph_info(); graph.renumber_unitigs(); } diff --git a/src/trim.rs b/src/trim.rs index 01f4ed1..4a23ad3 100644 --- a/src/trim.rs +++ b/src/trim.rs @@ -267,7 +267,7 @@ fn clean_up_graph(graph: &mut UnitigGraph, sequences: &Vec) { has occurred above."); graph.recalculate_depths(); graph.remove_zero_depth_unitigs(); - merge_linear_paths(graph, sequences, false); + merge_linear_paths(graph, sequences); graph.print_basic_graph_info(); graph.renumber_unitigs(); }