From b67bac8e503c8459bcb4e9a786299d7d8359a05b Mon Sep 17 00:00:00 2001 From: Andrew Frantz Date: Sun, 11 Feb 2024 13:09:40 -0500 Subject: [PATCH] fix: lots of code cleanup --- src/derive/command/endedness.rs | 34 +++-- src/derive/command/readlen.rs | 2 +- src/derive/endedness.rs | 1 + src/derive/endedness/compute.rs | 160 +++------------------- src/derive/endedness/results.rs | 119 ++++++++++++++++ src/derive/junction_annotation/compute.rs | 57 ++++---- src/derive/junction_annotation/results.rs | 1 + src/derive/readlen/compute.rs | 25 ++-- src/derive/strandedness/compute.rs | 71 +++++----- src/utils/read_groups.rs | 4 +- 10 files changed, 238 insertions(+), 236 deletions(-) create mode 100644 src/derive/endedness/results.rs diff --git a/src/derive/command/endedness.rs b/src/derive/command/endedness.rs index 7532e06..a35b390 100644 --- a/src/derive/command/endedness.rs +++ b/src/derive/command/endedness.rs @@ -44,7 +44,7 @@ pub struct DeriveEndednessArgs { calc_rpt: bool, /// Round RPT to the nearest INT before comparing to expected values. - /// Appropriate if using `-n` > 0. + /// Appropriate if using `-n` > 0. Unrounded value is reported in output. #[arg(long, default_value = "false")] round_rpt: bool, } @@ -108,18 +108,26 @@ pub fn derive(args: DeriveEndednessArgs) -> anyhow::Result<()> { } } - if !record.flags().is_segmented() { - ordering_flags.entry(read_group).or_default().unsegmented += 1; - } else if record.flags().is_first_segment() && !record.flags().is_last_segment() { - ordering_flags.entry(read_group).or_default().first += 1; - } else if !record.flags().is_first_segment() && record.flags().is_last_segment() { - ordering_flags.entry(read_group).or_default().last += 1; - } else if record.flags().is_first_segment() && record.flags().is_last_segment() { - ordering_flags.entry(read_group).or_default().both += 1; - } else if !record.flags().is_first_segment() && !record.flags().is_last_segment() { - ordering_flags.entry(read_group).or_default().neither += 1; - } else { - unreachable!(); + match ( + record.flags().is_segmented(), + record.flags().is_first_segment(), + record.flags().is_last_segment(), + ) { + (false, _, _) => { + ordering_flags.entry(read_group).or_default().unsegmented += 1; + } + (true, true, false) => { + ordering_flags.entry(read_group).or_default().first += 1; + } + (true, false, true) => { + ordering_flags.entry(read_group).or_default().last += 1; + } + (true, true, true) => { + ordering_flags.entry(read_group).or_default().both += 1; + } + (true, false, false) => { + ordering_flags.entry(read_group).or_default().neither += 1; + } } counter.inc(); diff --git a/src/derive/command/readlen.rs b/src/derive/command/readlen.rs index ccaa1a6..e6b0c51 100644 --- a/src/derive/command/readlen.rs +++ b/src/derive/command/readlen.rs @@ -55,7 +55,7 @@ pub fn derive(args: DeriveReadlenArgs) -> anyhow::Result<()> { let record = result?; let len = record.sequence().len(); - read_lengths.entry(len).and_modify(|e| *e += 1).or_insert(1); + *read_lengths.entry(len).or_default() += 1; counter.inc(); if counter.time_to_break(&num_records) { diff --git a/src/derive/endedness.rs b/src/derive/endedness.rs index bf321e0..6f8d5cc 100644 --- a/src/derive/endedness.rs +++ b/src/derive/endedness.rs @@ -1,3 +1,4 @@ //! Supporting functionality for the `ngs derive endedness` subcommand. pub mod compute; +pub mod results; diff --git a/src/derive/endedness/compute.rs b/src/derive/endedness/compute.rs index 2b63d26..265fc2d 100644 --- a/src/derive/endedness/compute.rs +++ b/src/derive/endedness/compute.rs @@ -1,16 +1,16 @@ //! Module holding the logic for computing the endedness of a BAM. -use serde::Serialize; use std::collections::HashMap; use std::collections::HashSet; use std::ops::{Add, AddAssign}; use std::sync::Arc; use tracing::warn; +use crate::derive::endedness::results; use crate::utils::read_groups::ReadGroupPtr; /// Struct holding the ordering flags for a single read group. -#[derive(Debug, Clone)] +#[derive(Debug, Clone, Default)] pub struct OrderingFlagsCounts { /// The number of reads without 0x1 set. pub unsegmented: usize, @@ -40,12 +40,6 @@ impl OrderingFlagsCounts { } } -impl Default for OrderingFlagsCounts { - fn default() -> Self { - Self::new() - } -} - impl Add for OrderingFlagsCounts { type Output = Self; @@ -70,120 +64,6 @@ impl AddAssign for OrderingFlagsCounts { } } -/// Struct holding the per read group results for an `ngs derive endedness` -/// subcommand call. -#[derive(Debug, Serialize)] -pub struct ReadGroupDerivedEndednessResult { - /// Name of the read group. - pub read_group: String, - - /// Whether or not an endedness was determined for this read group. - pub succeeded: bool, - - /// The endedness of this read group or "Unknown". - pub endedness: String, - - /// The number of reads without 0x1 set. - pub unsegmented: usize, - - /// The f+l- read count. - pub first: usize, - - /// The f-l+ read count. - pub last: usize, - - /// The f+l+ read count. - pub both: usize, - - /// The f-l- read count. - pub neither: usize, - - /// The reads per template (RPT). - /// Only available if `args.calc_rpt` is true. - pub rpt: Option, -} - -impl ReadGroupDerivedEndednessResult { - /// Creates a new [`ReadGroupDerivedEndednessResult`]. - fn new( - read_group: String, - succeeded: bool, - endedness: String, - counts: OrderingFlagsCounts, - rpt: Option, - ) -> Self { - ReadGroupDerivedEndednessResult { - read_group, - succeeded, - endedness, - unsegmented: counts.unsegmented, - first: counts.first, - last: counts.last, - both: counts.both, - neither: counts.neither, - rpt, - } - } -} - -/// Struct holding the final results for an `ngs derive endedness` subcommand -/// call. -#[derive(Debug, Serialize)] -pub struct DerivedEndednessResult { - /// Whether or not the `ngs derive endedness` subcommand succeeded. - pub succeeded: bool, - - /// The overall endedness of the file or "Unknown". - pub endedness: String, - - /// The number of reads without 0x1 set. - pub unsegmented: usize, - - /// The overall f+l- read count. - pub first: usize, - - /// The overall f-l+ read count. - pub last: usize, - - /// The overall f+l+ read count. - pub both: usize, - - /// The overall f-l- read count. - pub neither: usize, - - /// The overall reads per template (RPT). - /// Only available if `args.calc_rpt` is true. - pub rpt: Option, - - /// Vector of [`ReadGroupDerivedEndednessResult`]s. - /// One for each read group in the BAM, - /// and potentially one for any reads with an unknown read group. - pub read_groups: Vec, -} - -impl DerivedEndednessResult { - /// Creates a new [`DerivedEndednessResult`]. - pub fn new( - succeeded: bool, - endedness: String, - counts: OrderingFlagsCounts, - rpt: Option, - read_groups: Vec, - ) -> Self { - DerivedEndednessResult { - succeeded, - endedness, - unsegmented: counts.unsegmented, - first: counts.first, - last: counts.last, - both: counts.both, - neither: counts.neither, - rpt, - read_groups, - } - } -} - /// Calculate the reads per template overall and for each read group. fn calculate_reads_per_template( read_names: HashMap>, @@ -204,17 +84,16 @@ fn calculate_reads_per_template( let read_group_set: HashSet = read_groups.iter().cloned().collect(); if read_group_set.len() == 1 { + // All found read groups assigned to this QNAME are the same. + // We assume this means all the reads came from the same template. let read_group = Arc::clone(read_group_set.iter().next().unwrap()); - read_group_reads - .entry(Arc::clone(&read_group)) - .and_modify(|e| *e += num_reads) - .or_insert(num_reads); - read_group_templates - .entry(read_group) - .and_modify(|e| *e += 1) - .or_insert(1); + *read_group_reads.entry(Arc::clone(&read_group)).or_default() += num_reads; + *read_group_templates.entry(read_group).or_default() += 1; } else { + // The QNAME is in multiple read groups. + // We assume this means the reads came from multiple templates. + // More specifically, we assume that exactly one template will originate from each read group. warning_count += 1; match warning_count { 1..=100 => { @@ -230,16 +109,10 @@ fn calculate_reads_per_template( } for read_group in read_groups { - read_group_reads - .entry(Arc::clone(read_group)) - .and_modify(|e| *e += 1) - .or_insert(1); + *read_group_reads.entry(Arc::clone(read_group)).or_default() += 1; } for read_group in read_group_set { - read_group_templates - .entry(read_group) - .and_modify(|e| *e += 1) - .or_insert(1); + *read_group_templates.entry(read_group).or_default() += 1; } } } @@ -268,7 +141,7 @@ fn predict_endedness( paired_deviance: f64, reads_per_template: Option, round_rpt: bool, -) -> ReadGroupDerivedEndednessResult { +) -> results::ReadGroupDerivedEndednessResult { let unsegmented = rg_ordering_flags.unsegmented; let first = rg_ordering_flags.first; let last = rg_ordering_flags.last; @@ -282,7 +155,7 @@ fn predict_endedness( "No reads were detected in this read group: {}", read_group_name ); - return ReadGroupDerivedEndednessResult::new( + return results::ReadGroupDerivedEndednessResult::new( read_group_name, false, "Unknown".to_string(), @@ -291,7 +164,7 @@ fn predict_endedness( ); } - let mut result = ReadGroupDerivedEndednessResult::new( + let mut result = results::ReadGroupDerivedEndednessResult::new( read_group_name, false, "Unknown".to_string(), @@ -331,6 +204,7 @@ fn predict_endedness( } // only both present if first == 0 && last == 0 && both > 0 && neither == 0 { + // Prior logic (before addition of unsegmented checks) left as comment for posterity // match reads_per_template { // Some(rpt) => { // if rpt == 1.0 || (round_rpt && rpt.round() as usize == 1) { @@ -389,7 +263,7 @@ pub fn predict( read_names: HashMap>, paired_deviance: f64, round_rpt: bool, -) -> DerivedEndednessResult { +) -> results::DerivedEndednessResult { let mut rg_rpts: HashMap = HashMap::new(); let mut overall_rpt: Option = None; if !read_names.is_empty() { @@ -420,7 +294,7 @@ pub fn predict( round_rpt, ); - DerivedEndednessResult::new( + results::DerivedEndednessResult::new( overall_result.succeeded, overall_result.endedness, overall_flags, diff --git a/src/derive/endedness/results.rs b/src/derive/endedness/results.rs new file mode 100644 index 0000000..aef11c9 --- /dev/null +++ b/src/derive/endedness/results.rs @@ -0,0 +1,119 @@ +//! Module holding the results structs for the `ngs derive endedness` subcommand. + +use serde::Serialize; + +use crate::derive::endedness::compute::OrderingFlagsCounts; + +/// Struct holding the per read group results for an `ngs derive endedness` +/// subcommand call. +#[derive(Debug, Serialize)] +pub struct ReadGroupDerivedEndednessResult { + /// Name of the read group. + pub read_group: String, + + /// Whether or not an endedness was determined for this read group. + pub succeeded: bool, + + /// The endedness of this read group or "Unknown". + pub endedness: String, + + /// The number of reads without 0x1 set. + pub unsegmented: usize, + + /// The f+l- read count. + pub first: usize, + + /// The f-l+ read count. + pub last: usize, + + /// The f+l+ read count. + pub both: usize, + + /// The f-l- read count. + pub neither: usize, + + /// The reads per template (RPT). + /// Only available if `args.calc_rpt` is true. + pub rpt: Option, +} + +impl ReadGroupDerivedEndednessResult { + /// Creates a new [`ReadGroupDerivedEndednessResult`]. + pub fn new( + read_group: String, + succeeded: bool, + endedness: String, + counts: OrderingFlagsCounts, + rpt: Option, + ) -> Self { + ReadGroupDerivedEndednessResult { + read_group, + succeeded, + endedness, + unsegmented: counts.unsegmented, + first: counts.first, + last: counts.last, + both: counts.both, + neither: counts.neither, + rpt, + } + } +} + +/// Struct holding the final results for an `ngs derive endedness` subcommand +/// call. +#[derive(Debug, Serialize)] +pub struct DerivedEndednessResult { + /// Whether or not the `ngs derive endedness` subcommand succeeded. + pub succeeded: bool, + + /// The overall endedness of the file or "Unknown". + pub endedness: String, + + /// The number of reads without 0x1 set. + pub unsegmented: usize, + + /// The overall f+l- read count. + pub first: usize, + + /// The overall f-l+ read count. + pub last: usize, + + /// The overall f+l+ read count. + pub both: usize, + + /// The overall f-l- read count. + pub neither: usize, + + /// The overall reads per template (RPT). + /// Only available if `args.calc_rpt` is true. + pub rpt: Option, + + /// Vector of [`ReadGroupDerivedEndednessResult`]s. + /// One for each read group in the BAM, + /// and potentially one for any reads with an unknown read group. + pub read_groups: Vec, +} + +impl DerivedEndednessResult { + /// Creates a new [`DerivedEndednessResult`]. + pub fn new( + succeeded: bool, + endedness: String, + counts: OrderingFlagsCounts, + rpt: Option, + read_groups: Vec, + ) -> Self { + DerivedEndednessResult { + succeeded, + endedness, + unsegmented: counts.unsegmented, + first: counts.first, + last: counts.last, + both: counts.both, + neither: counts.neither, + rpt, + read_groups, + } + } +} diff --git a/src/derive/junction_annotation/compute.rs b/src/derive/junction_annotation/compute.rs index 3a67b9b..069a5dc 100644 --- a/src/derive/junction_annotation/compute.rs +++ b/src/derive/junction_annotation/compute.rs @@ -142,7 +142,7 @@ pub fn process( } // (7) Calculate the start position of this read. This will - // later be used to find the position of any introns. + // be used to find the position of any introns. let start = match record.alignment_start() { Some(s) => s, _ => bail!("Could not parse record's start position."), @@ -155,6 +155,7 @@ pub fn process( // This is an intron. Kind::Skip => { // Check that `op.len() >= params.min_intron_length` later, + // once all reads supporting short junctions have been collected // for better metric reporting. let intron_start = cur_pos; @@ -175,14 +176,10 @@ pub fn process( continue; } - let exon_starts = match exons.starts.get(seq_name) { - Some(starts) => starts, - _ => bail!("Could not find exon starts for contig: {}", seq_name), - }; - let exon_ends = match exons.ends.get(seq_name) { - Some(ends) => ends, - _ => bail!("Could not find exon ends for contig: {}", seq_name), - }; + // The following unwraps are safe because we checked that the reference + // sequence is annotated above. + let exon_starts = exons.starts.get(seq_name).unwrap(); + let exon_ends = exons.ends.get(seq_name).unwrap(); let mut intron_start_known = false; let mut intron_end_known = false; @@ -299,27 +296,29 @@ pub fn summarize( ); // Tally up observed junctions and spliced reads. - let mut juncs; - let mut support; - (juncs, support) = tally_junctions_and_support(&results.junction_annotations.known); - results.summary.known_junctions = juncs; - results.summary.known_junctions_read_support = support; - (juncs, support) = tally_junctions_and_support(&results.junction_annotations.partial_novel); - results.summary.partial_novel_junctions = juncs; - results.summary.partial_novel_junctions_read_support = support; - (juncs, support) = tally_junctions_and_support(&results.junction_annotations.complete_novel); - results.summary.complete_novel_junctions = juncs; - results.summary.complete_novel_junctions_read_support = support; - (juncs, support) = - tally_junctions_and_support(&results.junction_annotations.unannotated_reference); - results.summary.unannotated_reference_junctions = juncs; - results.summary.unannotated_reference_junctions_read_support = support; - - // Tally up total junctions and spliced reads. + ( + results.summary.known_junctions, + results.summary.known_junctions_read_support, + ) = tally_junctions_and_support(&results.junction_annotations.known); + ( + results.summary.partial_novel_junctions, + results.summary.partial_novel_junctions_read_support, + ) = tally_junctions_and_support(&results.junction_annotations.partial_novel); + ( + results.summary.complete_novel_junctions, + results.summary.complete_novel_junctions_read_support, + ) = tally_junctions_and_support(&results.junction_annotations.complete_novel); + ( + results.summary.unannotated_reference_junctions, + results.summary.unannotated_reference_junctions_read_support, + ) = tally_junctions_and_support(&results.junction_annotations.unannotated_reference); + + // Tally up total junctions. results.summary.total_junctions = results.summary.known_junctions + results.summary.partial_novel_junctions + results.summary.complete_novel_junctions + results.summary.unannotated_reference_junctions; + // Tally up total read support. results.summary.total_junctions_read_support = results.summary.known_junctions_read_support + results.summary.partial_novel_junctions_read_support + results.summary.complete_novel_junctions_read_support @@ -327,7 +326,7 @@ pub fn summarize( // Calculate percentages. let total_junctions = results.summary.total_junctions as f64 - - results.summary.unannotated_reference_junctions as f64; + - results.summary.unannotated_reference_junctions as f64; // exclude unannotated junctions from percentages results.summary.known_junctions_percent = results.summary.known_junctions as f64 / total_junctions * 100.0; results.summary.partial_novel_junctions_percent = @@ -336,15 +335,19 @@ pub fn summarize( results.summary.complete_novel_junctions as f64 / total_junctions * 100.0; // Calculate average read support. + // Total results.summary.average_junction_read_support = results.summary.total_junctions_read_support as f64 / results.summary.total_junctions as f64; + // Known results.summary.average_known_junction_read_support = results.summary.known_junctions_read_support as f64 / results.summary.known_junctions as f64; + // Partial Novel results.summary.average_partial_novel_junction_read_support = results.summary.partial_novel_junctions_read_support as f64 / results.summary.partial_novel_junctions as f64; + // Complete Novel results.summary.average_complete_novel_junction_read_support = results.summary.complete_novel_junctions_read_support as f64 / results.summary.complete_novel_junctions as f64; diff --git a/src/derive/junction_annotation/results.rs b/src/derive/junction_annotation/results.rs index ced00da..29daf20 100644 --- a/src/derive/junction_annotation/results.rs +++ b/src/derive/junction_annotation/results.rs @@ -41,6 +41,7 @@ pub struct JunctionAnnotations { pub unannotated_reference: JunctionsMap, } +// TODO should contigs be sorted? impl Serialize for JunctionAnnotations { fn serialize(&self, serializer: S) -> Result { let mut known = Vec::new(); diff --git a/src/derive/readlen/compute.rs b/src/derive/readlen/compute.rs index 28b6f97..2867dbb 100644 --- a/src/derive/readlen/compute.rs +++ b/src/derive/readlen/compute.rs @@ -17,8 +17,8 @@ pub struct DerivedReadlenResult { /// The majority vote percentage of the consensus read length. pub majority_pct_detected: f64, - /// Status of the evidence that supports (or does not support) this - /// read length. + /// Status of the evidence that supports (or does not support) the + /// consensus read length. pub evidence: Vec<(usize, usize)>, } @@ -61,15 +61,20 @@ pub fn predict( let consensus_read_length = max_read_length; let majority_detected = max_count as f64 / num_samples as f64; - let mut result = - DerivedReadlenResult::new(false, None, majority_detected * 100.0, read_lengths); - - if majority_detected >= majority_vote_cutoff { - result.succeeded = true; - result.consensus_read_length = Some(consensus_read_length); + match majority_detected >= majority_vote_cutoff { + true => anyhow::Ok(DerivedReadlenResult::new( + true, + Some(consensus_read_length), + majority_detected * 100.0, + read_lengths, + )), + false => anyhow::Ok(DerivedReadlenResult::new( + false, + None, + majority_detected * 100.0, + read_lengths, + )), } - - anyhow::Ok(result) } #[cfg(test)] diff --git a/src/derive/strandedness/compute.rs b/src/derive/strandedness/compute.rs index a880467..f2cb477 100644 --- a/src/derive/strandedness/compute.rs +++ b/src/derive/strandedness/compute.rs @@ -1,22 +1,16 @@ //! Module holding the logic for computing the strandedness. -use noodles::bam; -use noodles::core::Region; -use noodles::gff; -use noodles::sam; -use noodles::sam::record::data::field::Tag; use noodles::sam::record::MappingQuality; +use noodles::{bam, core::Region, gff, sam}; use rand::Rng; use rust_lapper::Lapper; -use std::collections::HashMap; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::ops::{Add, AddAssign}; use std::sync::Arc; use crate::derive::strandedness::results; -use crate::utils::alignment::filter_by_mapq; -use crate::utils::display::RecordCounter; -use crate::utils::read_groups::UNKNOWN_READ_GROUP; +use crate::utils::read_groups; +use crate::utils::{alignment::filter_by_mapq, display::RecordCounter}; const STRANDED_THRESHOLD: f64 = 80.0; const UNSTRANDED_THRESHOLD: f64 = 40.0; @@ -49,7 +43,7 @@ impl AddAssign for Counts { } } -/// Struct for possible (valid) strand orientations. +/// Struct for valid strand orientations. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum Strand { /// Forward strand. @@ -84,7 +78,10 @@ impl TryFrom for Strand { /// Struct for tracking the order of segments in a record. #[derive(Clone, Copy, Debug)] enum SegmentOrder { + /// The first segment in a record. First, + + /// The last segment in a record. Last, } @@ -105,6 +102,8 @@ impl TryFrom for SegmentOrder { } /// Struct holding the parsed BAM file and its index. +/// TODO this code is repeated. Should be in a common module. +/// Will be moved to utils::formats in a future PR. pub struct ParsedBAMFile { /// The BAM reader. pub reader: bam::Reader>, @@ -131,7 +130,7 @@ pub struct StrandednessParams { /// The number of genes to test for strandedness. pub num_genes: usize, - /// The maximum number of iterations to try before giving up. + /// The maximum number of genes to try before giving up. pub max_genes_per_try: usize, /// Minimum number of reads mapped to a gene to be considered @@ -139,7 +138,7 @@ pub struct StrandednessParams { pub min_reads_per_gene: usize, /// Minumum mapping quality for a record to be considered. - /// `None` means no filtering by MAPQ. This also allows + /// `None` means no filtering by MAPQ. This allows /// for records _without_ a MAPQ to be counted. pub min_mapq: Option, @@ -156,22 +155,23 @@ pub struct StrandednessParams { pub count_duplicates: bool, } -/// Function to disqualify a gene based on its strand and exons. +/// Function to disqualify a gene based on its strand and its exons' strand. fn disqualify_gene(gene: &gff::Record, exons: &HashMap<&str, Lapper>) -> bool { // gene_strand guaranteed to be Forward or Reverse by initialization code. let gene_strand = Strand::try_from(gene.strand()).unwrap(); let mut all_on_same_strand = true; - let mut at_least_one_exon = false; - - if let Some(intervals) = exons.get(gene.reference_sequence_name()) { - for exon in intervals.find(gene.start().into(), gene.end().into()) { - at_least_one_exon = true; - if exon.val != gene_strand { - all_on_same_strand = false; - break; - } - } - } + + let at_least_one_exon = match exons.get(gene.reference_sequence_name()) { + Some(intervals) => intervals + .find(gene.start().into(), gene.end().into()) + .any(|exon| { + if exon.val != gene_strand { + all_on_same_strand = false; + } + true + }), + None => false, + }; if all_on_same_strand && at_least_one_exon { return false; @@ -242,16 +242,7 @@ fn classify_read( all_counts: &mut AllReadGroupsCounts, read_metrics: &mut results::ReadRecordMetrics, ) { - let read_group = match read.data().get(Tag::ReadGroup) { - Some(rg) => { - let rg = rg.to_string(); - if !all_counts.found_rgs.contains(&rg) { - all_counts.found_rgs.insert(Arc::new(rg.clone())); - } - Arc::clone(all_counts.found_rgs.get(&rg).unwrap()) - } - None => Arc::clone(&UNKNOWN_READ_GROUP), - }; + let read_group = read_groups::get_read_group(read, Some(&mut all_counts.found_rgs)); let rg_counts = all_counts.counts.entry(read_group).or_default(); @@ -324,7 +315,7 @@ pub fn predict_strandedness( { result.succeeded = true; result.strandedness = "Unstranded".to_string(); - } + } // else Inconclusive result } @@ -342,7 +333,7 @@ pub fn predict( ) -> Result { let mut rng = rand::thread_rng(); let mut num_genes_considered = 0; // Local to this attempt - let mut counter = RecordCounter::new(Some(1_000)); + let mut counter = RecordCounter::new(Some(1_000)); // Also local to this attempt let genes_remaining = gene_records.len(); let max_iters = if params.max_genes_per_try > genes_remaining { @@ -366,12 +357,13 @@ pub fn predict( } let cur_gene = gene_records.swap_remove(rng.gen_range(0..gene_records.len())); - counter.inc(); + counter.inc(); // Technically this is off-by-one, as the gene hasn't been processed yet. if disqualify_gene(&cur_gene, exons) { metrics.genes.mixed_strands += 1; // Tracked across attempts continue; } + // gene_strand guaranteed to be Forward or Reverse by initialization code. let cur_gene_strand = Strand::try_from(cur_gene.strand()).unwrap(); let mut enough_reads = false; @@ -388,7 +380,7 @@ pub fn predict( } if num_genes_considered < params.num_genes { tracing::warn!( - "Evaluated the maximum number of genes ({}) before considering the requested amount of genes ({}) for this try. Only considering {} genes.", + "Evaluated the maximum number of genes ({}) before considering the requested amount of genes ({}) for this try. Only considering an additional {} genes this try.", max_iters, params.num_genes, num_genes_considered, @@ -423,6 +415,7 @@ pub fn predict( #[cfg(test)] mod tests { use super::*; + use noodles::sam::record::data::field::Tag; use rust_lapper::Interval; #[test] diff --git a/src/utils/read_groups.rs b/src/utils/read_groups.rs index 7cd0b57..86caf5d 100644 --- a/src/utils/read_groups.rs +++ b/src/utils/read_groups.rs @@ -11,10 +11,8 @@ use tracing::warn; /// Type alias for a read group pointer. pub type ReadGroupPtr = Arc; -// Strings used to index into the HashMaps used to store the Read Group ordering flags. -// Lazy statics are used to save memory. lazy_static! { - /// String used to index into th HashMaps used to store the "unknown_read_group" ordering flags. + /// String used to represent an unknown read group. Wrapped in an Arc to prevent redundant memory usage. pub static ref UNKNOWN_READ_GROUP: ReadGroupPtr = Arc::new(String::from("unknown_read_group")); }