Skip to content

Commit

Permalink
feat: remove fuzzy searching ability. Boosts performance as well.
Browse files Browse the repository at this point in the history
  • Loading branch information
a-frantz committed Jan 10, 2024
1 parent 65c4935 commit 6fcbd60
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 66 deletions.
28 changes: 7 additions & 21 deletions src/derive/command/junction_annotation.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
//! Functionality relating to the `ngs derive junction_annotation` subcommand itself.
use std::collections::HashMap;
use std::collections::HashSet;
use std::path::PathBuf;

use anyhow::Context;
Expand Down Expand Up @@ -37,13 +38,9 @@ pub struct JunctionAnnotationArgs {
#[arg(short = 'i', long, value_name = "USIZE", default_value = "50")]
pub min_intron_length: usize,

/// Add +- this amount to intron positions when looking up exon positions.
#[arg(short = 'k', long, value_name = "U8", default_value = "0")]
pub fuzzy_junction_match_range: u8,

/// Minimum number of reads supporting a junction to be considered.
#[arg(short = 'r', long, value_name = "U8", default_value = "2")]
pub min_read_support: u8,
#[arg(short = 'r', long, value_name = "usize", default_value = "2")]
pub min_read_support: usize,

/// Minumum mapping quality for a record to be considered.
/// Set to 0 to disable this filter and allow reads _without_
Expand All @@ -68,8 +65,8 @@ pub struct JunctionAnnotationArgs {
pub fn derive(args: JunctionAnnotationArgs) -> anyhow::Result<()> {
info!("Starting derive junction_annotation subcommand.");

let mut exon_starts: HashMap<&str, Vec<usize>> = HashMap::new();
let mut exon_ends: HashMap<&str, Vec<usize>> = HashMap::new();
let mut exon_starts: HashMap<&str, HashSet<usize>> = HashMap::new();
let mut exon_ends: HashMap<&str, HashSet<usize>> = HashMap::new();

// (1) Parse the GFF file and collect all exon features.
debug!("Reading all records in GFF.");
Expand All @@ -91,18 +88,8 @@ pub fn derive(args: JunctionAnnotationArgs) -> anyhow::Result<()> {
let start: usize = record.start().into();
let end: usize = record.end().into();

exon_starts.entry(seq_name).or_default().push(start);
exon_ends.entry(seq_name).or_default().push(end + 1); // TODO why +1? It works
}

debug!("Finalizing GFF features lookup.");
for starts in exon_starts.values_mut() {
starts.sort_unstable();
starts.dedup();
}
for ends in exon_ends.values_mut() {
ends.sort_unstable();
ends.dedup();
exon_starts.entry(seq_name).or_default().insert(start);
exon_ends.entry(seq_name).or_default().insert(end + 1); // TODO why +1? It works
}

debug!("Done reading GFF.");
Expand All @@ -111,7 +98,6 @@ pub fn derive(args: JunctionAnnotationArgs) -> anyhow::Result<()> {
let mut results = JunctionAnnotationResults::default();
let params = compute::JunctionAnnotationParameters {
min_intron_length: args.min_intron_length,
fuzzy_junction_match_range: args.fuzzy_junction_match_range,
min_read_support: args.min_read_support,
min_mapq: args.min_mapq,
no_supplementary: args.no_supplementary,
Expand Down
70 changes: 25 additions & 45 deletions src/derive/junction_annotation/compute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use noodles::sam::alignment::Record;
use noodles::sam::record::cigar::op::Kind;
use noodles::sam::Header;
use std::collections::HashMap;
use std::collections::HashSet;
use std::num::NonZeroUsize;

use crate::derive::junction_annotation::results::JunctionAnnotationResults;
Expand All @@ -15,11 +16,8 @@ pub struct JunctionAnnotationParameters {
/// Minimum intron length to consider.
pub min_intron_length: usize,

/// Add +- this amount to intron positions when looking up exon positions.
pub fuzzy_junction_match_range: u8,

/// Minimum number of reads supporting a junction to be considered.
pub min_read_support: u8,
pub min_read_support: usize,

/// Minumum mapping quality for a record to be considered.
/// 0 if MAPQ shouldn't be considered.
Expand All @@ -38,8 +36,8 @@ pub struct JunctionAnnotationParameters {
/// Main function to annotate junctions one record at a time.
pub fn process(
record: &Record,
exon_starts: &HashMap<&str, Vec<usize>>,
exon_ends: &HashMap<&str, Vec<usize>>,
exon_starts: &HashMap<&str, HashSet<usize>>,
exon_ends: &HashMap<&str, HashSet<usize>>,
header: &Header,
params: &JunctionAnnotationParameters,
results: &mut JunctionAnnotationResults,
Expand Down Expand Up @@ -177,28 +175,11 @@ pub fn process(

let mut intron_start_known = false;
let mut intron_end_known = false;
// To allow collapsing fuzzy junctions,
// we need to store the reference positions of the exon boundaries.
// We initialize these values to the position of the found intron.
let mut ref_intron_start = intron_start;
let mut ref_intron_end = intron_end;
for exon_end in exon_ends.iter() {
if intron_start >= (exon_end - params.fuzzy_junction_match_range as usize)
&& intron_start <= (exon_end + params.fuzzy_junction_match_range as usize)
{
intron_start_known = true;
ref_intron_start = *exon_end;
break;
}
if exon_ends.contains(&intron_start) {
intron_start_known = true;
}
for exon_start in exon_starts.iter() {
if intron_end >= (exon_start - params.fuzzy_junction_match_range as usize)
&& intron_end <= (exon_start + params.fuzzy_junction_match_range as usize)
{
intron_end_known = true;
ref_intron_end = *exon_start;
break;
}
if exon_starts.contains(&intron_end) {
intron_end_known = true;
}

match (intron_start_known, intron_end_known) {
Expand All @@ -211,8 +192,8 @@ pub fn process(
.entry(seq_name.to_string())
.or_default()
.entry((
NonZeroUsize::new(ref_intron_start).unwrap(),
NonZeroUsize::new(ref_intron_end).unwrap(),
NonZeroUsize::new(intron_start).unwrap(),
NonZeroUsize::new(intron_end).unwrap(),
))
.and_modify(|e| *e += 1)
.or_insert(1);
Expand All @@ -227,8 +208,8 @@ pub fn process(
.entry(seq_name.to_string())
.or_default()
.entry((
NonZeroUsize::new(ref_intron_start).unwrap(),
NonZeroUsize::new(ref_intron_end).unwrap(),
NonZeroUsize::new(intron_start).unwrap(),
NonZeroUsize::new(intron_end).unwrap(),
))
.and_modify(|e| *e += 1)
.or_insert(1);
Expand All @@ -242,8 +223,8 @@ pub fn process(
.entry(seq_name.to_string())
.or_default()
.entry((
NonZeroUsize::new(ref_intron_start).unwrap(),
NonZeroUsize::new(ref_intron_end).unwrap(),
NonZeroUsize::new(intron_start).unwrap(),
NonZeroUsize::new(intron_end).unwrap(),
))
.and_modify(|e| *e += 1)
.or_insert(1);
Expand All @@ -269,12 +250,12 @@ pub fn summarize(results: &mut JunctionAnnotationResults, params: &JunctionAnnot
v.retain(|(start, end), count| {
if end.get() - start.get() < params.min_intron_length {
num_junctions_too_short += 1;
if *count < params.min_read_support as usize {
if *count < params.min_read_support {
num_not_enough_support += 1;
}
num_rejected += 1;
false
} else if *count < params.min_read_support as usize {
} else if *count < params.min_read_support {
num_not_enough_support += 1;
if end.get() - start.get() < params.min_intron_length {
num_junctions_too_short += 1;
Expand All @@ -290,12 +271,12 @@ pub fn summarize(results: &mut JunctionAnnotationResults, params: &JunctionAnnot
v.retain(|(start, end), count| {
if end.get() - start.get() < params.min_intron_length {
num_junctions_too_short += 1;
if *count < params.min_read_support as usize {
if *count < params.min_read_support {
num_not_enough_support += 1;
}
num_rejected += 1;
false
} else if *count < params.min_read_support as usize {
} else if *count < params.min_read_support {
num_not_enough_support += 1;
if end.get() - start.get() < params.min_intron_length {
num_junctions_too_short += 1;
Expand All @@ -311,12 +292,12 @@ pub fn summarize(results: &mut JunctionAnnotationResults, params: &JunctionAnnot
v.retain(|(start, end), count| {
if end.get() - start.get() < params.min_intron_length {
num_junctions_too_short += 1;
if *count < params.min_read_support as usize {
if *count < params.min_read_support {
num_not_enough_support += 1;
}
num_rejected += 1;
false
} else if *count < params.min_read_support as usize {
} else if *count < params.min_read_support {
num_not_enough_support += 1;
if end.get() - start.get() < params.min_intron_length {
num_junctions_too_short += 1;
Expand All @@ -336,12 +317,12 @@ pub fn summarize(results: &mut JunctionAnnotationResults, params: &JunctionAnnot
v.retain(|(start, end), count| {
if end.get() - start.get() < params.min_intron_length {
num_junctions_too_short += 1;
if *count < params.min_read_support as usize {
if *count < params.min_read_support {
num_not_enough_support += 1;
}
num_rejected += 1;
false
} else if *count < params.min_read_support as usize {
} else if *count < params.min_read_support {
num_not_enough_support += 1;
if end.get() - start.get() < params.min_intron_length {
num_junctions_too_short += 1;
Expand Down Expand Up @@ -458,7 +439,6 @@ mod tests {
let mut results = JunctionAnnotationResults::default();
let params = JunctionAnnotationParameters {
min_intron_length: 10,
fuzzy_junction_match_range: 0,
min_read_support: 2,
min_mapq: 30,
no_supplementary: false,
Expand All @@ -476,12 +456,12 @@ mod tests {
Map::<ReferenceSequence>::new(NonZeroUsize::try_from(400).unwrap()),
)
.build();
let exon_starts: HashMap<&str, Vec<usize>> =
HashMap::from([("sq1", vec![1, 11, 21, 31, 41, 51, 61, 71])]);
let exon_starts: HashMap<&str, HashSet<usize>> =
HashMap::from([("sq1", HashSet::from([1, 11, 21, 31, 41, 51, 61, 71]))]);
let exon_ends = exon_starts
.iter()
.map(|(k, v)| (*k, v.iter().map(|e| e + 10).collect()))
.collect::<HashMap<&str, Vec<usize>>>();
.collect::<HashMap<&str, HashSet<usize>>>();

// Test known junction
let mut record = Record::default();
Expand Down

0 comments on commit 6fcbd60

Please sign in to comment.