diff --git a/src/derive/command/junction_annotation.rs b/src/derive/command/junction_annotation.rs index 89e8cc2..fa1d41b 100644 --- a/src/derive/command/junction_annotation.rs +++ b/src/derive/command/junction_annotation.rs @@ -1,6 +1,7 @@ //! Functionality relating to the `ngs derive junction_annotation` subcommand itself. use std::collections::HashMap; +use std::collections::HashSet; use std::path::PathBuf; use anyhow::Context; @@ -37,13 +38,9 @@ pub struct JunctionAnnotationArgs { #[arg(short = 'i', long, value_name = "USIZE", default_value = "50")] pub min_intron_length: usize, - /// Add +- this amount to intron positions when looking up exon positions. - #[arg(short = 'k', long, value_name = "U8", default_value = "0")] - pub fuzzy_junction_match_range: u8, - /// Minimum number of reads supporting a junction to be considered. - #[arg(short = 'r', long, value_name = "U8", default_value = "2")] - pub min_read_support: u8, + #[arg(short = 'r', long, value_name = "usize", default_value = "2")] + pub min_read_support: usize, /// Minumum mapping quality for a record to be considered. /// Set to 0 to disable this filter and allow reads _without_ @@ -68,8 +65,8 @@ pub struct JunctionAnnotationArgs { pub fn derive(args: JunctionAnnotationArgs) -> anyhow::Result<()> { info!("Starting derive junction_annotation subcommand."); - let mut exon_starts: HashMap<&str, Vec> = HashMap::new(); - let mut exon_ends: HashMap<&str, Vec> = HashMap::new(); + let mut exon_starts: HashMap<&str, HashSet> = HashMap::new(); + let mut exon_ends: HashMap<&str, HashSet> = HashMap::new(); // (1) Parse the GFF file and collect all exon features. debug!("Reading all records in GFF."); @@ -91,18 +88,8 @@ pub fn derive(args: JunctionAnnotationArgs) -> anyhow::Result<()> { let start: usize = record.start().into(); let end: usize = record.end().into(); - exon_starts.entry(seq_name).or_default().push(start); - exon_ends.entry(seq_name).or_default().push(end + 1); // TODO why +1? It works - } - - debug!("Finalizing GFF features lookup."); - for starts in exon_starts.values_mut() { - starts.sort_unstable(); - starts.dedup(); - } - for ends in exon_ends.values_mut() { - ends.sort_unstable(); - ends.dedup(); + exon_starts.entry(seq_name).or_default().insert(start); + exon_ends.entry(seq_name).or_default().insert(end + 1); // TODO why +1? It works } debug!("Done reading GFF."); @@ -111,7 +98,6 @@ pub fn derive(args: JunctionAnnotationArgs) -> anyhow::Result<()> { let mut results = JunctionAnnotationResults::default(); let params = compute::JunctionAnnotationParameters { min_intron_length: args.min_intron_length, - fuzzy_junction_match_range: args.fuzzy_junction_match_range, min_read_support: args.min_read_support, min_mapq: args.min_mapq, no_supplementary: args.no_supplementary, diff --git a/src/derive/junction_annotation/compute.rs b/src/derive/junction_annotation/compute.rs index 7ac4769..06b9de2 100644 --- a/src/derive/junction_annotation/compute.rs +++ b/src/derive/junction_annotation/compute.rs @@ -6,6 +6,7 @@ use noodles::sam::alignment::Record; use noodles::sam::record::cigar::op::Kind; use noodles::sam::Header; use std::collections::HashMap; +use std::collections::HashSet; use std::num::NonZeroUsize; use crate::derive::junction_annotation::results::JunctionAnnotationResults; @@ -15,11 +16,8 @@ pub struct JunctionAnnotationParameters { /// Minimum intron length to consider. pub min_intron_length: usize, - /// Add +- this amount to intron positions when looking up exon positions. - pub fuzzy_junction_match_range: u8, - /// Minimum number of reads supporting a junction to be considered. - pub min_read_support: u8, + pub min_read_support: usize, /// Minumum mapping quality for a record to be considered. /// 0 if MAPQ shouldn't be considered. @@ -38,8 +36,8 @@ pub struct JunctionAnnotationParameters { /// Main function to annotate junctions one record at a time. pub fn process( record: &Record, - exon_starts: &HashMap<&str, Vec>, - exon_ends: &HashMap<&str, Vec>, + exon_starts: &HashMap<&str, HashSet>, + exon_ends: &HashMap<&str, HashSet>, header: &Header, params: &JunctionAnnotationParameters, results: &mut JunctionAnnotationResults, @@ -177,28 +175,11 @@ pub fn process( let mut intron_start_known = false; let mut intron_end_known = false; - // To allow collapsing fuzzy junctions, - // we need to store the reference positions of the exon boundaries. - // We initialize these values to the position of the found intron. - let mut ref_intron_start = intron_start; - let mut ref_intron_end = intron_end; - for exon_end in exon_ends.iter() { - if intron_start >= (exon_end - params.fuzzy_junction_match_range as usize) - && intron_start <= (exon_end + params.fuzzy_junction_match_range as usize) - { - intron_start_known = true; - ref_intron_start = *exon_end; - break; - } + if exon_ends.contains(&intron_start) { + intron_start_known = true; } - for exon_start in exon_starts.iter() { - if intron_end >= (exon_start - params.fuzzy_junction_match_range as usize) - && intron_end <= (exon_start + params.fuzzy_junction_match_range as usize) - { - intron_end_known = true; - ref_intron_end = *exon_start; - break; - } + if exon_starts.contains(&intron_end) { + intron_end_known = true; } match (intron_start_known, intron_end_known) { @@ -211,8 +192,8 @@ pub fn process( .entry(seq_name.to_string()) .or_default() .entry(( - NonZeroUsize::new(ref_intron_start).unwrap(), - NonZeroUsize::new(ref_intron_end).unwrap(), + NonZeroUsize::new(intron_start).unwrap(), + NonZeroUsize::new(intron_end).unwrap(), )) .and_modify(|e| *e += 1) .or_insert(1); @@ -227,8 +208,8 @@ pub fn process( .entry(seq_name.to_string()) .or_default() .entry(( - NonZeroUsize::new(ref_intron_start).unwrap(), - NonZeroUsize::new(ref_intron_end).unwrap(), + NonZeroUsize::new(intron_start).unwrap(), + NonZeroUsize::new(intron_end).unwrap(), )) .and_modify(|e| *e += 1) .or_insert(1); @@ -242,8 +223,8 @@ pub fn process( .entry(seq_name.to_string()) .or_default() .entry(( - NonZeroUsize::new(ref_intron_start).unwrap(), - NonZeroUsize::new(ref_intron_end).unwrap(), + NonZeroUsize::new(intron_start).unwrap(), + NonZeroUsize::new(intron_end).unwrap(), )) .and_modify(|e| *e += 1) .or_insert(1); @@ -269,12 +250,12 @@ pub fn summarize(results: &mut JunctionAnnotationResults, params: &JunctionAnnot v.retain(|(start, end), count| { if end.get() - start.get() < params.min_intron_length { num_junctions_too_short += 1; - if *count < params.min_read_support as usize { + if *count < params.min_read_support { num_not_enough_support += 1; } num_rejected += 1; false - } else if *count < params.min_read_support as usize { + } else if *count < params.min_read_support { num_not_enough_support += 1; if end.get() - start.get() < params.min_intron_length { num_junctions_too_short += 1; @@ -290,12 +271,12 @@ pub fn summarize(results: &mut JunctionAnnotationResults, params: &JunctionAnnot v.retain(|(start, end), count| { if end.get() - start.get() < params.min_intron_length { num_junctions_too_short += 1; - if *count < params.min_read_support as usize { + if *count < params.min_read_support { num_not_enough_support += 1; } num_rejected += 1; false - } else if *count < params.min_read_support as usize { + } else if *count < params.min_read_support { num_not_enough_support += 1; if end.get() - start.get() < params.min_intron_length { num_junctions_too_short += 1; @@ -311,12 +292,12 @@ pub fn summarize(results: &mut JunctionAnnotationResults, params: &JunctionAnnot v.retain(|(start, end), count| { if end.get() - start.get() < params.min_intron_length { num_junctions_too_short += 1; - if *count < params.min_read_support as usize { + if *count < params.min_read_support { num_not_enough_support += 1; } num_rejected += 1; false - } else if *count < params.min_read_support as usize { + } else if *count < params.min_read_support { num_not_enough_support += 1; if end.get() - start.get() < params.min_intron_length { num_junctions_too_short += 1; @@ -336,12 +317,12 @@ pub fn summarize(results: &mut JunctionAnnotationResults, params: &JunctionAnnot v.retain(|(start, end), count| { if end.get() - start.get() < params.min_intron_length { num_junctions_too_short += 1; - if *count < params.min_read_support as usize { + if *count < params.min_read_support { num_not_enough_support += 1; } num_rejected += 1; false - } else if *count < params.min_read_support as usize { + } else if *count < params.min_read_support { num_not_enough_support += 1; if end.get() - start.get() < params.min_intron_length { num_junctions_too_short += 1; @@ -458,7 +439,6 @@ mod tests { let mut results = JunctionAnnotationResults::default(); let params = JunctionAnnotationParameters { min_intron_length: 10, - fuzzy_junction_match_range: 0, min_read_support: 2, min_mapq: 30, no_supplementary: false, @@ -476,12 +456,12 @@ mod tests { Map::::new(NonZeroUsize::try_from(400).unwrap()), ) .build(); - let exon_starts: HashMap<&str, Vec> = - HashMap::from([("sq1", vec![1, 11, 21, 31, 41, 51, 61, 71])]); + let exon_starts: HashMap<&str, HashSet> = + HashMap::from([("sq1", HashSet::from([1, 11, 21, 31, 41, 51, 61, 71]))]); let exon_ends = exon_starts .iter() .map(|(k, v)| (*k, v.iter().map(|e| e + 10).collect())) - .collect::>>(); + .collect::>>(); // Test known junction let mut record = Record::default();